Example #1
def batch_process(file_dict, dbpath, memory):
    """Parses, resolves corefs, and extracts triplets from file in a
    directory.
    """
    from threading import Thread
    try:

        # Parse files with progress bar
        t = Thread(target=monitor_progress,
                   kwargs={'num_files': len(file_dict)})
        t.daemon = True
        t.start()

        print "Starting corenlp. Wait a few moments."
        this_dir = os.path.dirname(os.path.realpath(__file__))
        corenlp_path = os.path.join(this_dir,
                "stanford-corenlp-full-2013-11-12")
        log_path = os.path.join(TEMP, 'corenlp_log.txt')
        parses = corenlp.batch_parse(TEMP, log_path, memory=memory,
                corenlp_path=corenlp_path)

        # Extract triplets and save to db
        pbar = ProgressBar(len(file_dict))
        file_name = ''
        for parse_dict in parses:
            if not pbar.has_started():
                print "Extracting triplets..."
                pbar.start()
            article_dict = file_dict[parse_dict['file_name']]

            # add article to db
            database.save_article(article_dict, dbpath)

            # resolve corefs and extract triplets
            triplets = process_parsed(parse_dict)

            # save triplet to db
            if len(triplets) > 0:
                for triplet in triplets:
                    triplet['article_path'] = article_dict['path']
                    triplet['pub_date'] = article_dict['pub_date']

                    database.save_triplet(triplet, dbpath)
            if parse_dict['file_name'] != file_name:
                file_name = parse_dict['file_name']
                pbar.tick()
    finally:  # remove temp files
        for root, dirs, fnames in os.walk(TEMP):
            for fname in fnames:
                p = os.path.join(root, fname)
                os.remove(p)
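
# Hypothetical call site for batch_process(), included only to make the
# expected inputs concrete. preprocess_dir() is borrowed from Example #2, and
# the database path and memory setting are assumptions, not original code;
# file_dict is assumed to map each temp file's name to its article metadata.
if __name__ == '__main__':
    file_dict = preprocess_dir('articles/')
    batch_process(file_dict, dbpath='triplets.db', memory='3g')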
Example #2
def batch_process(directory):
    """Parses, resolves corefs, and extracts triplets from file in a
    directory.
    """
    from threading import Thread
    try:
        file_dict = preprocess_dir(directory)

        # Parse files with progress bar
        t = Thread(target=monitor_progress,
                   kwargs={'num_files': len(file_dict)})
        t.daemon = True
        t.start()
        print "Starting corenlp. Wait a few moments."
        parses = corenlp.batch_parse(config.TEMP, memory=config.memory)

        # Extract triplets and save to db
        pbar = ProgressBar(len(file_dict))
        file_name = ''
        for parse_dict in parses:
            if not pbar.has_started():
                print "Extracting triplets..."
                pbar.start()
            article_dict = file_dict[parse_dict['file_name']]

            # add article to db
            database.save_article(article_dict)

            # resolve corefs and extract triplets
            triplets = process_parsed(parse_dict)

            # save triplet to db
            if len(triplets) > 0:
                for triplet in triplets:
                    triplet['article_path'] = article_dict['path']
                    triplet['pub_date'] = article_dict['pub_date']

                    database.save_triplet(triplet)
            if parse_dict['file_name'] != file_name:
                file_name = parse_dict['file_name']
                pbar.tick()
    finally:  # remove temp files
        for root, dirs, fnames in os.walk(config.TEMP):
            for fname in fnames:
                p = os.path.join(root, fname)
                os.remove(p)
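
# Example #2 and the later examples read settings from a config module that is
# not shown in this collection. The sketch below is a guess at what it
# provides, inferred only from the attributes used above (TEMP, DATA, DB,
# memory); the actual paths and values are assumptions.
import os

_ROOT = os.path.dirname(os.path.abspath(__file__))

TEMP = os.path.join(_ROOT, 'temp')        # scratch dir handed to CoreNLP
DATA = os.path.join(_ROOT, 'data')        # holds corenlp_log.txt (Example #3)
DB = os.path.join(DATA, 'triplets.db')    # SQLite database (Examples #4-5)
memory = '3g'                             # Java heap size passed to batch_parse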
Example #3
def monitor_progress(num_files):
    """Watches a log file changes and draws a progress bar
    in the terminal.
    """
    from time import sleep
    import sys

    pbar = ProgressBar(num_files)

    # Try three times to open the log file; bail out cleanly if it never appears
    f = None
    for x in range(3):
        try:
            f = open(os.path.join(config.DATA, 'corenlp_log.txt'))
            break
        except IOError:
            sleep(4)
    if f is None:
        print "ERROR: Unable to find corenlp_log.txt"
        return

    fname = ''
    while True:
        f.seek(0) # Refresh log.
        try:
            line = f.readlines()[-1]
        except IndexError:
            sleep(1)
            continue

        if line and line.strip().startswith('Annotating file'):
            # Once we find the right line, start the pbar
            if not pbar.has_started():
                print "Sending files to StanfordCoreNLP..."
                pbar.start()

            # Ensure corenlp is working on a new file
            new_fname = line.split('/')[-1].split(' ')[0]
            if pbar.has_started() and new_fname != fname:
                fname = new_fname
                pbar.tick()

        if pbar.is_done():
            # Stop the thread
            return
        sleep(.1)
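
# Every example in this collection relies on a ProgressBar helper that is not
# shown here. The class below is a minimal sketch of the interface they call
# -- start(), tick(), has_started(), is_done() -- inferred purely from usage;
# the real implementation may render itself differently.
import sys


class ProgressBar(object):

    def __init__(self, total, width=40):
        self.total = max(total, 1)
        self.width = width
        self.count = 0
        self.started = False

    def start(self):
        # Mark the bar as running and draw it empty
        self.started = True
        self._draw()

    def has_started(self):
        return self.started

    def tick(self):
        # Advance one unit of work and redraw the bar
        self.count += 1
        self._draw()

    def is_done(self):
        return self.count >= self.total

    def _draw(self):
        filled = min(self.width,
                     int(self.width * self.count / float(self.total)))
        bar = '#' * filled + '-' * (self.width - filled)
        sys.stdout.write('\r[%s] %d/%d' % (bar, self.count, self.total))
        sys.stdout.flush()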
Example #4
def make_graph(date_range=False, giant=False, show_pbar=True):
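    """Builds a directed igraph graph from the reliable triplets in the
    database. If date_range is a (start, end) tuple, only triplets whose
    pub_date falls in [start, end) are included. If giant is True, only
    the giant weakly connected component is returned.
    """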
    conn = sqlite3.connect(config.DB)

    if date_range is False:
        query = """SELECT article_path, subject, predicate, obj, sentence,
        sentiment, pub_date, subj_named, obj_named
        FROM triplets
        WHERE is_reliable = 1"""
        triplets = conn.execute(query).fetchall()
    else:
        query = """SELECT article_path, subject, predicate, obj, sentence,
        sentiment, pub_date, subj_named, obj_named FROM triplets
        WHERE is_reliable = 1
        AND pub_date >= ?
        AND pub_date < ?"""
        params = (date_range[0], date_range[1])
        triplets = conn.execute(query, params).fetchall()

    G = ig.Graph(directed=True)

    if show_pbar:
        pbar = ProgressBar(len(triplets))
        pbar.start()

    # Generate graph
    for article_path, subject, pred, obj, sentence, sentiment, pub_date, \
            subj_named, obj_named in triplets:

        if subject.lower() != obj.lower():
            # Sentence = sentence.encode('ascii', 'ignore')
            add_igraph_vertex(subject, subj_named, G)
            add_igraph_vertex(obj, obj_named, G)
            add_igraph_edge(
                article_path, subject, pred, obj, sentence, sentiment, G)

        if show_pbar:
            pbar.tick()

    conn.close()
    if giant:
        return G.clusters(mode=ig.WEAK).giant()
    else:
        return G
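
# Hypothetical usage of make_graph(); the 'YYYY-MM-DD' pub_date format and the
# date values below are assumptions about how dates are stored in the
# triplets table.
if __name__ == '__main__':
    G = make_graph(date_range=('2014-01-01', '2014-02-01'), giant=True,
                   show_pbar=False)
    print "%d vertices, %d edges" % (G.vcount(), G.ecount())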
Example #5
def set_reliable(frequency_threshold, weight_threshold, unnamed_threshold):
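    """Flags reliable triplets in the database: clears any existing
    is_reliable flags, asks get_reliable() for the row ids that pass the
    given thresholds, and sets is_reliable = 1 on each of them.
    """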
    clear_reliable()

    query = "UPDATE triplets SET is_reliable = 1 WHERE ROWID = ?"
    conn = sqlite3.connect(config.DB, isolation_level=None)

    reliable = get_reliable(
            frequency_threshold, weight_threshold, unnamed_threshold)

    print "Setting reliable..."
    pbar = ProgressBar(len(reliable))
    pbar.start()
    for rowid in reliable:
        params = (rowid,)
        conn.execute(query, params)
        pbar.tick()

    conn.commit()
    conn.close()

    print "Done!"
Example #6
    def scrape(self, pause=(30, 60)):
        """Scraper's main loop. Pulls a news website's search result page
        via format url and extracts article links with extract_links().
        Then it loops through those extracted links and pulls out the article
        with extract_article() and metadata with extract_metadata(). Lastly,
        it yields the results in a dictionary.
        Arguments:
            pause (tuple): Program will pause for a random number of seconds
                between pause[0] and pause[1].
            pause (None): Program will not pause at all. NOT RECOMMENDED!
        Yields:
            dict -- a dictionary containing data returned by extract_metadata()
                and the article content under the 'content' key.
        """
        # Start the progress bar
        pbar = ProgressBar(self._num_articles)
        pbar.start()

        count = 0
        # Loop through search result pages.
        for i in itertools.count(self._start, self._step):
            # Stop if we have desired number of articles.
            if count >= self._num_articles:
                break

            url = self._format_url.format(i)

            # log search results page turn
            self.logger.info("Extracting search results from {}".format(url))

            # Begin scraping
            try:
                # Extract search result URLs
                rawsearchresults = requests.get(url, headers=self.headers)
                searchresults = self.extract_links(rawsearchresults.text, url)

                # Add referer to headers to look like a real boy
                self.headers['Referer'] = url

                # Walk through search results
                for link in searchresults:
                    # Stop if we have desired number of articles.
                    if count >= self._num_articles:
                        break

                    self.logger.info("Extracting article from {}".format(link))

                    # Download article
                    raw_article = requests.get(link, headers=self.headers)

                    # Extract article / remove boilerplate
                    content = self.extract_article(raw_article.text, link)

                    # Extract various metadata
                    article = self.extract_metadata(raw_article.text, link)

                    # add article content to metadata dictionary.
                    article['content'] = content

                    if pause:
                        sleep(randint(pause[0], pause[1]))

                    count += 1
                    pbar.tick()
                    yield article

            except ParseError as e:
                # Log error, then continue
                self.logger.error(str(e))

                # Update counter and progressbar
                count += 1
                pbar.tick()
            except Exception:
                # Log the error with traceback, then re-raise
                self.logger.error('Error occurred while in scrape()',
                        exc_info=True)
                raise
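
# Hypothetical driver for the scraper; 'MyNewsScraper' and its constructor
# arguments are assumptions. The loop itself follows the docstring: scrape()
# yields one metadata dict per article with the body under the 'content' key.
if __name__ == '__main__':
    scraper = MyNewsScraper(num_articles=50)
    for article in scraper.scrape(pause=(30, 60)):
        print article['content'][:80]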