Example #1
def load_sources_from_file(filepath='manage/sources.txt'):
    """
    Load feeds from a text file.
    Each line should be the url to the source
    you want to add.
    """
    logger.info('Loading sources from file. This may take awhile...')
    with open(filepath, 'r') as f:
        add_sources([line.strip() for line in f if line.strip()])
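
A minimal usage sketch: the default path comes from the snippet above, while the example URLs are only illustrative.

# manage/sources.txt -- one feed URL per line, for example:
#   http://www.nytimes.com/services/xml/rss/nyt/World.xml
#   http://feeds.bbci.co.uk/news/rss.xml
load_sources_from_file('manage/sources.txt')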
Example #2
def evaluate():
    if os.environ.get('FLASK_ENV') == 'TESTING':
        logger.info('Preparing evaluation database...')
        db.create_all()

        evaluate_clustering()

        logger.info('Cleaning up evaluation database...')
        db.session.remove()
        db.drop_all()
    else:
        logger.error('This function must be run with FLASK_ENV=TESTING.')
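
A minimal sketch of invoking it (hypothetical driver code). The guard above requires FLASK_ENV=TESTING; in practice the variable would usually be exported before the app and its database are configured.

import os

os.environ['FLASK_ENV'] = 'TESTING'
evaluate()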
Example #3
def download(force=False):
    """
    Downloads and extracts the desired DBpedia datasets.

    They are extracted to the app's `DATASETS_PATH` value.
    """

    # Get the desired dataset urls.
    dataset_urls = [dataset_url for dataset_url in get_dataset_urls()
                    if any(setname in dataset_url for setname in DESIRED_DATASETS)]

    for dataset_url in dataset_urls:
        # dc = decompressed
        dc_filepath = os.path.join(DATASETS_PATH,
                os.path.basename(dataset_url)[:-4]) # remove '.bz2'

        if os.path.exists(dc_filepath) and not force:
            logger.warning('File exists, not re-downloading and extracting. You can force by passing `force=True`.')
            continue

        # Download the dataset.
        logger.info('Downloading knowledge dataset from {0}'.format(dataset_url))
        filepath = gullet.download(dataset_url, '/tmp/')
        logger.info('Downloaded to {0}'.format(filepath))

        # Decompress the files.
        logger.info('Extracting to {0}'.format(dc_filepath))
        with open(dc_filepath, 'wb+') as dc_file, bz2.BZ2File(filepath, 'rb') as file:
            for data in iter(lambda : file.read(100 * 1024), b''):
                dc_file.write(data)

        # Clean up.
        os.remove(filepath)
    logger.info('Downloading and extraction complete.')
Example #4
def collect():
    """
    Fetch articles from the sources,
    and save (or update) to db.
    """
    results = []

    logger.info('Fetching articles...')
    print('collecting')

    # Fetch entries for each source
    for source in Source.query.all():
        try:
            logger.info('Fetching from {0}...'.format(source.ext_url))
            articles = feed.articles(source)

            # Check for existing copy.
            for article in articles:
                if not Article.query.filter_by(ext_url=article.url).count():
                    db.session.add(article)
                results.append(article)

        except feed.SAXException as e:
            # Error with the feed, make a note.
            logger.info('Error fetching from {0}.'.format(source.ext_url))
            source.errors += 1

    logger.info('Finished fetching articles.')

    db.session.commit()

    return results
Example #5
def active():
    """
    Get info about currently executing tasks.
    """
    try:
        active_tasks = celery.control.inspect().active()
        if not active_tasks:
            logger.info('No active tasks.')
            return False
    except IOError as e:
        logger.error('Error connecting to MQ. Check that it is running.')
        return False

    num_tasks = sum(len(tasks) for tasks in active_tasks.values())
    logger.info('There are {0} executing tasks.'.format(num_tasks))
    return active_tasks
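
A sketch of how a caller might consume the result, assuming (as Celery's inspect().active() does) that the returned dict maps worker names to lists of task descriptions.

tasks = active()
if tasks:
    for worker_name, task_list in tasks.items():
        logger.info('{0}: {1} task(s)'.format(worker_name, len(task_list)))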
Example #6
    def _iterate_pages(self):
        """
        Parses out and yields pages from the dump.
        Only yields pages that are in
        namespace=0 (i.e. articles).
        """
        for elem in self.iterate('page'):
            # Check the namespace;
            # only pages in namespace 0 are articles.
            # https://en.wikipedia.org/wiki/Wikipedia:Namespace
            ns = int(self._find(elem, 'ns').text)
            if ns == 0:
                self.num_docs += 1
                yield elem

        logger.info('There are {0} docs in this dump.'.format(self.num_docs))
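
For context, a tiny standalone sketch of the same namespace check, using plain xml.etree.ElementTree and no namespace prefixes (a real MediaWiki dump declares an XML namespace).

import xml.etree.ElementTree as ET

page = ET.fromstring('<page><title>Earth</title><ns>0</ns></page>')
if int(page.find('ns').text) == 0:
    print('Article page:', page.find('title').text)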
Example #7
def create_sources(filepath):
    """
    Load feeds from a JSON file.
    It should consist of a dict of source name => list of feeds, like so::

        {
            'The New York Times': [
                'http://www.nytimes.com/services/xml/rss/nyt/World.xml',
                'http://www.nytimes.com/services/xml/rss/nyt/politics.xml'
            ]
        }
    """
    logger.info('Loading sources from file. This may take awhile...')
    with open(filepath, 'r') as sources:
        raw_sources = json.load(sources)
    feed.add_sources(raw_sources)
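
For reference, the file itself would be plain JSON (double-quoted strings); a hypothetical sources.json and the corresponding call might look like this.

# sources.json (hypothetical path):
# {
#     "The New York Times": [
#         "http://www.nytimes.com/services/xml/rss/nyt/World.xml",
#         "http://www.nytimes.com/services/xml/rss/nyt/politics.xml"
#     ]
# }
create_sources('sources.json')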
Example #8
    def fetch_dump(self):
        """
        Downloads this instance's Wikipedia dump to replace
        this instance's current file.
        """

        # Default dump files.
        base = 'http://dumps.wikimedia.org/enwiki/latest/'
        pages = 'enwiki-latest-pages-articles.xml.bz2'

        # Build a default url if one is not specified.
        if not self.url:
            self.url = '{0}{1}'.format(base, pages)

        # Download!
        logger.info('Fetching pages dump from {0}'.format(self.url))
        self.download(self.url)
Example #9
    def _generate_tfidf(self, docs):
        """
        Generate the TF-IDF representations for all the digested docs.

        Args:
            | docs (list)       -- see `_prepare_tfidf`
        """
        logger.info('Page processing complete. Generating TF-IDF representations.')

        doc_ids, corpus_counts = self._prepare_tfidf(docs)

        # Iterate over the specified docs.
        for doc_id in doc_ids:
            self._calculate_tfidf(doc_id, corpus_counts)

        logger.info('TF-IDF calculations completed!')
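
`_prepare_tfidf` and `_calculate_tfidf` are not shown here; as a reminder of what they compute, a standalone sketch of the standard TF-IDF weighting (not this class's implementation) might look like this.

import math
from collections import Counter

def tfidf(doc_tokens, doc_freqs, num_docs):
    """TF-IDF weights for one tokenized doc, given per-term document frequencies."""
    counts = Counter(doc_tokens)
    total = sum(counts.values())
    return {term: (count / total) * math.log(num_docs / (1 + doc_freqs.get(term, 0)))
            for term, count in counts.items()}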
Example #10
def collect(feed):
    """
    Fetch articles from the specified feed,
    and save to db.
    """

    try:
        logger.info('Fetching from {0}...'.format(feed.ext_url))

        def commit_article(article):
            db.session.add(article)

        get_articles(feed, commit_article)
        db.session.commit()

    except SAXException as e:
        # Error with the feed, make a note.
        logger.info('Error fetching from {0}.'.format(feed.ext_url))
        feed.errors += 1
        db.session.commit()
Example #11
def digest(force=False):
    """
    Digests downloaded DBpedia `ttl` (Turtle) dumps
    using Apache Jena's `tdbloader2`.

    This digested data can then be interfaced via
    Apache Jena's Fuseki server (see `argos.core.knowledge`).

    Note: `tdbloader2` only runs properly on Unix systems.
    """

    knowledge_path = os.path.join(DATASETS_PATH, 'knodb')
    logger.info('Digesting the datasets to {0}...'.format(knowledge_path))

    if os.path.exists(knowledge_path):
        if not force:
            logger.warning('It looks like a knowledge database already exists, not rebuilding it. You can force by passing `force=True`.')
            return
        logger.warning('Existing knowledge database found. Removing...')
        shutil.rmtree(knowledge_path)

    loader_path = os.path.expanduser(os.path.join(APP['JENA_PATH'], 'bin/tdbloader2'))
    cmd = [loader_path, '--loc', knowledge_path]
    datasets = [os.path.join(DATASETS_PATH, dataset)
                for dataset in os.listdir(DATASETS_PATH)
                if dataset.endswith('.ttl')
                and any(setname in dataset for setname in DESIRED_DATASETS)]
    logger.info('Using the datasets: {0}'.format(' '.join(datasets)))

    cmd += datasets
    subprocess.call(cmd)
    logger.info('Digestion complete.')
Example #12
def workers():
    """
    Get info about currently available Celery workers.
    If none are available, or there are issues connecting
    to the MQ, returns False.

    Returns:
        | dict      -- dict of available workers.
        OR
        | bool      -- False if no available workers, or cannot connect to the MQ.
    """

    try:
        # Get info on available workers.
        workers = celery.control.inspect().stats()
        if not workers:
            logger.error('No Celery workers available.')
            return False
    except IOError as e:
        logger.error('Error connecting to MQ. Check that it is running.')
        return False

    logger.info('There are {0} workers available.'.format(len(workers)))
    return workers
Example #13
    def digest(self):
        """
        Will process this instance's dump.
        """

        # Check if the specified file exists.
        if not exists(self.file):
            logger.info('Specified file {0} not found, fetching...'.format(
                self.file))
            self.fetch_dump()

        logger.info('Beginning digestion of pages.')

        # Process pages and collect their text content ("docs").
        docs = [self._process_page(elem) for elem in self._iterate_pages()]

        logger.info('Vectorizing the page documents...')
        # Vectorize the docs.
        doc_vecs = brain.vectorize(docs)

        # Testing
        #outfile = open('/Users/ftseng/Desktop/test.pickle', 'wb')
        #import pickle
        #pickle.dump(doc_vecs, outfile)

        # Pickle the docs to save to Mongo.
        #_doc_vecs = self.db().pickle(doc_vecs)
        #processed_name = self.url if self.url else self.file
        #self.db().add({'dump': processed_name, 'docs': _doc_vecs})
        #self.db().close()

        # Generate TF-IDF representation
        # of all docs upon completion.
        #self._generate_tfidf(docs)

        logger.info('Digestion complete!')

        if not self.silent:
            processed_name = self.url if self.url else self.file
            notify(
                'TF-IDF calculations complete for {0}!'.format(processed_name))
Example #14
def evaluate_clustering():
    """
    Evaluate the clustering algorithm.
    """

    logger.info('Constructing expected clusters and articles...')
    expected_clusters = {}
    articles = []
    all_files = []

    # Collect all appropriate files.
    for dir, subdir, files in os.walk('manage/evaluate/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
    for dir, name, filepath in all_files:
        category = dir.split('/')[-1]
        with open(filepath, 'r') as f:
            article = Article(text=f.read(), title=name.split('/')[-1])
        expected_clusters.setdefault(category, []).append(article)
        articles.append(article)
        progress_bar(len(articles) / len(all_files) * 100)
    print('\n')

    logger.info('Will cluster {0} articles.'.format(len(articles)))
    logger.info('Expecting {0} clusters.'.format(len(
        expected_clusters.keys())))

    logger.info('Clustering...')
    p = cProfile.Profile()
    clusters = p.runcall(Event.cluster, articles, threshold=0.04, debug=True)

    logger.info('Created {0} clusters.'.format(len(clusters)))

    logger.info('Cluster composition is as follows...')
    for c in clusters:
        logger.info([m.title for m in c.members])

    logger.info('Profiling statistics from the clustering...')
    ps = pstats.Stats(p)
    ps.sort_stats('time').print_stats(10)
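
The function above builds expected_clusters but only logs the produced clusters; a hypothetical follow-up (not part of the original code) could score them with a simple purity measure.

def cluster_purity(clusters, expected_clusters):
    """Fraction of clustered articles whose cluster is dominated by their true category."""
    category_of = {a.title: cat for cat, arts in expected_clusters.items() for a in arts}
    correct, total = 0, 0
    for c in clusters:
        categories = [category_of[m.title] for m in c.members if m.title in category_of]
        if categories:
            correct += max(categories.count(cat) for cat in set(categories))
            total += len(categories)
    return correct / total if total else 0.0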
Example #15
def download(url, save_path, filename=None, progress=False):
    """
    Downloads a file from the specified URL.
    Will resume an existing download if the target
    server supports it (responds with the "Accept-Ranges" header).

    Args:
        | url (str)       -- url of the file to download
        | save_path (str) -- path to the directory to save the file
        | filename (str)  -- name to save the file as (defaults to the url's basename)
        | progress (bool) -- output progress bar to stdout
    """

    # Strip a trailing slash (or backslash), if there is one.
    save_path = save_path.rstrip('\\/')
    if filename is None:
        filename = url.split('/').pop()
    file = '{0}/{1}'.format(save_path, filename)

    existing_size = 0

    # If the file already exists,
    # and there is no newer file on the server...
    if os.path.exists(file) and not _expired(url, file):
        # Append to existing file.
        outfile = open(file, 'ab')

        # Figure out how many bytes we've got.
        existing_size = outfile.tell()

        # Setup request for only the remaining bytes.
        headers = {'Range': 'bytes={0}-'.format(existing_size)}
        req = request.Request(url, headers=headers)

    # Otherwise, create a new/overwrite existing file.
    else:
        # Create/overwrite file.
        outfile = open(file, 'wb')
        outfile.seek(0)

        # Vanilla request.
        req = request.Request(url)

    try:
        # Get response.
        resp = request.urlopen(req)

        # Get total size of content.
        total_size = float(resp.headers['Content-Length'].strip())

        # Check if the file has already been downloaded.
        if total_size == existing_size:
            logger.info('File already downloaded.')
            return

        # Check that the server accepts ranges.
        # If it does not, the server will ignore the Range header,
        # and we have to start over from the beginning.
        if existing_size > 0 and not resp.headers.get('Accept-Ranges', None):
            logger.info('Server does not allow resuming of downloads.')
            logger.info('Starting from the beginning! :D')
            outfile.close()
            outfile = open(file, 'wb')
            existing_size = 0

        if progress:
            progress_bar( (existing_size/total_size) * 100 )

        # Pull out the chunks!
        for chunk in iter(lambda: resp.read(CHUNK), b''):
            # Write the chunk to the file.
            outfile.write(chunk)

            # Update existing size.
            existing_size += len(chunk)

            percent_complete = (existing_size/total_size) * 100

            # Show progress.
            if progress:
                progress_bar(percent_complete)

        if progress:
            sys.stdout.write('\n')

        # Return the download's filepath.
        return file

    except request.HTTPError as e:
        logger.error('HTTP Error: {0} {1}'.format(e.code, url))
    except request.URLError as e:
        logger.error('URL Error: {0} {1}'.format(e.reason, url))
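
A minimal usage sketch (the URL and save path both appear elsewhere in these examples). If the call is interrupted and re-run, the Range request above lets it resume where it left off, provided the server sends Accept-Ranges.

filepath = download('http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
                    '/tmp/', progress=True)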