Example #1
def get_dxp(start, end, news_url, portables_url):
    """Calculate the time until the start or end of a Double XP Weekend"""
    utc_time = datetime.utcnow()
    start_date = parser.parse(start)
    end_date = parser.parse(end)
    if utc_time < start_date:
        delta = relativedelta(start_date, utc_time)
        if delta.days >= 1:
            return '1. [DXP Weekend starts in: **%(days)d day, %(hours)d hours**](' \
                    % delta.__dict__ + news_url + \
                    '#dxp) \n \n2. [Portables & Boxes FC Information](' + portables_url + ') \n'
        return '1. [DXP Weekend starts in: **%(hours)d hours**](' \
                % delta.__dict__ + news_url + \
                '#dxp) \n \n2. [Portables & Boxes FC Information](' + portables_url + ') \n'
    elif utc_time > end_date:
        return '1. DXP Weekend has ended.'
    else:
        delta = relativedelta(end_date, utc_time)
        if delta.days > 1:
            return '1. [DXP Weekend is LIVE: **%(days)d days, %(hours)d hours to go**]('  \
                    % delta.__dict__ + news_url + \
                    '#dxp) \n \n2. [Portables & Boxes FC Information](' + portables_url + ') \n'
        elif delta.days == 1:
            return '1. [DXP Weekend is LIVE: **%(days)d day, %(hours)d hours to go**](' \
                    % delta.__dict__ + news_url + \
                    '#dxp) \n \n2. [Portables & Boxes FC Information](' + portables_url + ') \n'
        return '1. [DXP Weekend is LIVE: **%(hours)d hours to go**](' \
                % delta.__dict__ + news_url + \
                '#dxp) \n \n2. [Portables & Boxes FC Information](' + portables_url + ') \n'
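The %-formatting above works because relativedelta exposes its computed fields (days, hours, and so on) as instance attributes, so __dict__ can back a '%(days)d'-style template. A minimal sketch with made-up datetimes:

from datetime import datetime
from dateutil.relativedelta import relativedelta

delta = relativedelta(datetime(2024, 2, 2, 12, 0), datetime(2024, 2, 1, 9, 0))
print('%(days)d day, %(hours)d hours' % delta.__dict__)  # 1 day, 3 hours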
Example #2
    def add_document(self, writer, file_path, config):
        file_name = str(
            file_path.replace(".", " ").replace("/", " ").replace(
                "\\", " ").replace("_", " ").replace("-", " "))
        # read file content
        with codecs.open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            path = str(file_path)

        # parse markdown fields
        parser = MarkdownParser()
        parser.parse(content, config)

        modtime = os.path.getmtime(path)
        print(
            "adding to index: path: %s size:%d tags:'%s' headlines:'%s' modtime=%d"
            % (path, len(content), parser.tags, parser.headlines, modtime))
        writer.add_document(path=path,
                            filename=file_name,
                            headlines=parser.headlines,
                            tags=parser.tags,
                            content=content,
                            doubleemphasiswords=parser.doubleemphasiswords,
                            emphasiswords=parser.emphasiswords,
                            time=modtime)
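The keyword-argument call to writer.add_document() suggests a Whoosh IndexWriter backed by a matching schema. A minimal sketch under that assumption (the schema and field names below are illustrative, not the project's actual ones):

from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.index import create_in

schema = Schema(path=ID(stored=True), filename=TEXT(stored=True),
                content=TEXT, time=STORED)
ix = create_in("indexdir", schema)  # the "indexdir" directory must already exist
writer = ix.writer()
writer.add_document(path=u"notes/todo.md", filename=u"notes todo md",
                    content=u"# TODO\nwrite docs", time=1700000000.0)
writer.commit()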
Example #3
def analyse(rootdir):
    files = os.listdir(rootdir)
    new = True
    for file in files:
        if not file.endswith('.xml'):
            continue
        if new:
            parser_new.parse(os.path.join(rootdir, file))
            new = False
        else:
            parser.parse(os.path.join(rootdir, file))
Example #4
def select_date(tree, element):
    try:
        text = tree.select(element)[0].get_text().replace('\"', '')
        year = parser.parse(text).year
    except Exception as err:
        year = ''
        writelog('    PARSE ERROR', element, err)
    return year
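Assuming parser here is dateutil.parser, parse() builds a full datetime from loose text, so taking .year as above is all the scraper needs. A small sketch with a made-up string:

from dateutil import parser

text = '"March 5, 2019"'.replace('"', '')
print(parser.parse(text).year)  # 2019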
Example #5
def extract_np(c, data, parser, mode):
    text = word_tokenize(data)
    sentence = pos_tag(text)
    result = []
    parsed_sentence = parser.parse(sentence)
    # # Clearer visuals for debugging
    # print(parsed_sentence)
    # parsed_sentence.draw()
    for np in clean_np(parsed_sentence, mode):
        result.append(np)
    # Count how many times each NP appears in the input data (review + summary)
    c.update(lower_and_lemma(result))
Example #6
  def crawl_domain(self, domain, depth, debug=False, limit=None, visited=set()):
    """
    Fetches a domain, and then crawls its internal pages until given depth.
    Returns a dictionary of url -> html code.
    """
    pages = {}
    base_domain = urllib.parse.urlparse( domain ).netloc
    
    html = self.fetch( domain, debug)
    if html is not None:
      pages[domain] = html
      visited.add( domain )

    else:
      if debug is True:
        print( "Impossible to crawl %s" % domain )
      return {}

    if depth > 0 and (limit is None or limit > 0):
      dom = None      
      parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
      
      try:
        dom = parser.parse( html )
      except Exception as e:
        if debug is True:
          print( e )
        return {}
      
      links = html5wrapper.extract_doc_links( dom )
      for key in links:
        # We do not want anchors to be crawled
        if len(key) < 1 or key[0] == '#':
          continue
        
        url = urllib.parse.urljoin(domain, key)
        
        # Trying to get eventual file extension, and to check its validity
        parts = url.split(".")
        ext = parts[ len(parts) - 1].strip().lower()
        if ext in self.badextensions:
          continue
        
        # Let's check if it's an internal link, and not an outgoing one
        if base_domain == urllib.parse.urlparse( url ).netloc and \
           url not in visited and (limit is None or limit > 0):
             visited.add( url )
             pages.update( self.crawl_domain(url, depth - 1, debug, limit, visited) )
        
             if limit is not None:
               limit -= 1

    return pages
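With the "dom" tree builder, parser.parse() hands back an xml.dom.minidom-compatible document, which is presumably what html5wrapper.extract_doc_links() walks. A minimal sketch with an inline HTML string:

import html5lib

parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
dom = parser.parse("<p>See <a href='/about'>about</a> or <a href='#top'>top</a></p>")
print([a.getAttribute("href") for a in dom.getElementsByTagName("a")])  # ['/about', '#top']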
Example #7
def replace_images(msg, addr):
    parser = commonmark.Parser()
    ast = parser.parse(msg["data"]["raw"])
    ripper = html_image_ripper()
    ripper.feed(msg["data"]["cooked"])
    for cur, entering in ast.walker():
        if cur.t == "image" and entering:
            cur.t = "link"
            dest = ripper.images.pop(0)
            if dest.startswith("/"):
                dest = addr + dest
            cur.destination = dest
    renderer = commonmark_extensions.plaintext.CommonMarkToCommonMarkRenderer()
    return renderer.render(ast)
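The same walker pattern works with the stock commonmark renderer too. A minimal sketch with made-up markdown, rendering to HTML instead of the plaintext renderer used above:

import commonmark

ast = commonmark.Parser().parse("![logo](/img/logo.png) and [docs](https://example.com)")
for node, entering in ast.walker():
    if node.t == "image" and entering:
        node.destination = "https://example.com" + node.destination
print(commonmark.HtmlRenderer().render(ast))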
Example #8
    def work(self):
        while True:
            try:
                url = self._master.getTask()
                if not url:
                    break

                page = load(url)
                text, names = parser.parse(page)
                name = extractName(url)
                store(page, HTML_PATH + name, url)
                store(text, TEXT_PATH + name, url)
                self._master.report(url)
                self._master.pushUrls(namesToUrls(names))

            except (urllib.error.URLError, urllib.error.HTTPError) as exception:
                log.error('Load "%s"... %s %s', url, exception.reason, getattr(exception, 'code', ''))

        log.debug('Shutdown thread "%s"...', self.name)
Example #9
def stream_reuters_documents(data_path='/Users/newuser/Downloads/reuters21578'):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.

    """

    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        """Download the dataset."""
        print("downloading dataset (once and for all) into %s" %
              data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc
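A minimal usage sketch of the generator above, assuming the archive is already unpacked under data_path:

import itertools

for doc in itertools.islice(stream_reuters_documents(), 3):
    print(doc['title'], doc['topics'])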
Example #10
def nltk_parser(txt):
    myblob = TextBlobDE(txt)
    sent = [x[1] for x in myblob.tags]
    sent_text = [x[0] for x in myblob.tags]
    cfg_grammar = nltk.CFG.fromstring("""
    S -> NP VP | S CC S
    NP -> 'DT' N | 'DT' N PP | 'PRP' | N | 'PRP$'
    VP -> V NP | V NP PP | V ADJP
    ADJP -> 'RB' 'JJ' | 'JJ'
    PP -> P NP
    N -> 'NN' | 'NNP' | 'NNS' | 'FW'
    V -> 'VBN' | 'VB' | 'MD'
    P -> 'IN' | 'TO'
    CC -> 'CC'
    O -> 'RP' | 'WDT' | 'TRUNC' | 'CD'
    """)
    
    parser = nltk.parse.ChartParser(cfg_grammar)
    for tree in parser.parse(sent):
        print(tree)
        tree.draw()
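ChartParser.parse() takes a sequence of terminals (here POS tags) and yields every tree the grammar admits. A smaller self-contained sketch with a toy grammar and a hand-made tag sequence:

import nltk

toy_grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> 'DT' 'NN'
VP -> 'VBZ' NP
""")
for tree in nltk.parse.ChartParser(toy_grammar).parse(['DT', 'NN', 'VBZ', 'DT', 'NN']):
    print(tree)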
Example #11
    """Extract syntax highlighting choices out of the paste page."""

    def handle_starttag(self, tag, attrs):
        """The parser enters a tag."""
        if tag == 'option':
            self._current_tag = dict(attrs)['value']

    def handle_data(self, data):
        """The parser processes data inside a tag."""
        if self._current_tag is not None:
            self._resultdict[data] = self._current_tag

    def handle_endtag(self, tag):
        """The parser leaves a tag."""
        if tag == 'option':
            self._current_tag = None

    def parse(self, data):
        """Main method."""
        self._current_tag = None
        self._resultdict = {}
        self.feed(data)
        return self._resultdict


parser = PasteOfCodeParser()
data = parser.parse(requests.get('http://paste.ofcode.org/').text)

json.dump(data, sys.stdout, indent=4)
print()
Example #12
    def handle(self, *args, **options):
        if not options.get('input_json') or not options.get('app'):
            raise CommandError('Both --input_json and --app arguments are mandatory')

        with open(options.get('input_json'), 'rb') as f:
            groups, articles = json.loads(f.read().decode('UTF-8'))

        fixtures_path = os.path.join(settings.BASE_DIR, options.get('app'), 'fixtures')

        if not os.path.exists(fixtures_path):
            os.makedirs(fixtures_path)

        self.fill_groups(groups)

        with open(os.path.join(fixtures_path, "categories.json"), "wb") as f:
            f.write(bytes(
                json.dumps(self.all_groups, ensure_ascii=False, indent=4)
            , 'UTF-8'))

        self.stdout.write('Successfully built categories.json fixture with {0} records'.format(len(self.all_groups)))

        all_articles = []

        article_id = 1


        for article in articles:
            parsedDate = parser.parse(article['properties']['60']['value'], dayfirst=True, yearfirst=False)

            preview = article['properties']['CML2_PREVIEW_TEXT']['value']

            if preview and len(preview) >= 252:
                preview = preview[:252] + '...'

            all_articles.append({
                'model': 'news.article',
                'pk': article_id,
                'fields': {
                    'enabled': True,
                    'publish_date': parsedDate.strftime('%Y-%m-%dT%H:%M:%SZ'),
                    'slug': article['properties']['CML2_CODE']['value'],
                    'title': article['title'],
                    'announcement': preview,
                    'description': self.html.unescape(article['properties']['CML2_DETAIL_TEXT']['value'] or ''),
                    'source': article['properties']['62']['value'],
                    'author': article['properties']['64']['value'],
                    'translated_by': article['properties']['65']['value'],
                    'photo': article['properties']['54']['value'],
                    'photo_copyrights': article['properties']['66']['value'],
                    'views': 0,
                    'order': article['properties']['CML2_SORT']['value'],
                    #'tags': article['tags'],
                    'categories': [self.old_to_new_groups[gid] for gid in article['groups']]
                },
            })

            article_id += 1


        with open(os.path.join(fixtures_path, "articles.json"), "wb") as f:
            f.write(bytes(
                json.dumps(all_articles, ensure_ascii=False, indent=4)
            , 'UTF-8'))

        self.stdout.write('Successfully built articles.json fixture with {0} records'.format(len(articles)))
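The dayfirst/yearfirst flags passed to parser.parse() above control how ambiguous numeric dates are read. A minimal sketch with a made-up value:

from dateutil import parser

print(parser.parse('03.04.2015 12:30:00', dayfirst=True, yearfirst=False))
# 2015-04-03 12:30:00 -- '03' is taken as the day, not the month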
Example #13
    def crawl_domain(self,
                     domain,
                     depth,
                     debug=False,
                     limit=None,
                     visited=set()):
        """
    Fetches a domain, and then crawls its internal pages until given depth.
    Returns a dictionary of url -> html code.
    """
        pages = {}
        base_domain = urllib.parse.urlparse(domain).netloc

        html = self.fetch(domain, debug)
        if html is not None:
            pages[domain] = html
            visited.add(domain)

        else:
            if debug is True:
                print("Impossible to crawl %s" % domain)
            return {}

        if depth > 0 and (limit is None or limit > 0):
            dom = None
            parser = html5lib.HTMLParser(
                tree=html5lib.treebuilders.getTreeBuilder("dom"))

            try:
                dom = parser.parse(html)
            except Exception as e:
                if debug is True:
                    print(e)
                return {}

            links = html5wrapper.extract_doc_links(dom)
            for key in links:
                # We do not want anchors to be crawled
                if len(key) < 1 or key[0] == '#':
                    continue

                url = urllib.parse.urljoin(domain, key)

                # Trying to get eventual file extension, and to check its validity
                parts = url.split(".")
                ext = parts[len(parts) - 1].strip().lower()
                if ext in self.badextensions:
                    continue

                # Let's check if it's an internal link, and not an outgoing one
                if base_domain == urllib.parse.urlparse(url).netloc and \
                   url not in visited and (limit is None or limit > 0):
                    visited.add(url)
                    pages.update(
                        self.crawl_domain(url, depth - 1, debug, limit,
                                          visited))

                    if limit is not None:
                        limit -= 1

        return pages