Example No. 1
def evaluate_clustering():
    """
    Evaluate the clustering algorithm.
    """

    logger.info('Constructing expected clusters and articles...')
    expected_clusters = {}
    articles = []
    all_files = []

    # Collect all appropriate files.
    for dir, subdir, files in os.walk('manage/evaluate/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
    for dir, name, filepath in all_files:
        category = dir.split('/')[-1]
        with open(filepath, 'r') as f:
            article = Article(
                    text=f.read(),
                    title=name.split('/')[-1]
            )
        expected_clusters.setdefault(category, []).append(article)
        articles.append(article)
        progress_bar(len(articles) / len(all_files) * 100)
    print('\n')

    logger.info('Will cluster {0} articles.'.format(len(articles)))
    logger.info('Expecting {0} clusters.'.format(len(expected_clusters.keys())))

    logger.info('Clustering...')
    p = cProfile.Profile()
    clusters = p.runcall(Event.cluster, articles, threshold=0.04, debug=True)

    logger.info('Created {0} clusters.'.format(len(clusters)))

    logger.info('Cluster composition is as follows...')
    for c in clusters:
        logger.info([m.title for m in c.members])

    logger.info('Profiling statistics from the clustering...')
    ps = pstats.Stats(p)
    ps.sort_stats('time').print_stats(10)
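
The examples here report progress through a progress_bar helper that is never shown. It is only ever called with a percentage between 0 and 100, so the following is a minimal sketch of what it might look like; the implementation details are assumptions.

import sys

def progress_bar(percent):
    # Hypothetical sketch of the helper used in these examples: redraw a
    # single stdout line with a bar and the completion percentage (0-100).
    width = 40
    filled = int(width * percent / 100)
    sys.stdout.write('\r[{0}{1}] {2:.1f}%'.format(
        '=' * filled, ' ' * (width - filled), percent))
    sys.stdout.flush()
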
Example No. 2
def generate(keywords, num=5000):
    this_dir = path.dirname(__file__)
    articles = load_articles()

    # Filter down to articles containing all of the specified keywords.
    results = []
    articles = articles[:num]
    for idx, article in enumerate(articles):
        article_words = tokenize(article['text'])
        if set(article_words).issuperset(set(keywords)):
            results.append(article)
        progress_bar(idx / len(articles) * 100)

    # Store articles into separate text files.
    for article in results:
        # print(json.dumps(article, sort_keys=True, indent=4))
        article_path = 'unorganized_articles/{0}.txt'.format(article['title'])
        with open(path.join(this_dir, article_path), 'wb') as f:
            f.write(article['text'].encode('utf-8'))
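
A hypothetical invocation of generate(): the keyword list and count below are made up, but this is how the filter above would be used; only articles whose tokenized text contains every keyword get written out.

# Hypothetical usage: keep only articles mentioning both keywords,
# scanning at most the first 1000 loaded articles.
generate(['ukraine', 'protest'], num=1000)
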
Example No. 3
def seed(debug=False):
    this_dir = os.path.dirname(__file__)
    seeds = open(os.path.join(this_dir, 'seed.json'), 'r')
    sources = open(os.path.join(this_dir, 'seed_sources.json'), 'r')

    sample_images = [
        'https://upload.wikimedia.org/wikipedia/commons/d/d5/Michael_Rogers_-_Herbiers_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/6/6e/Brandenburger_Tor_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/ChicagoAntiGaddafiHopeless.jpg/576px-ChicagoAntiGaddafiHopeless.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Evo_morales_2_year_bolivia_Joel_Alvarez.jpg/640px-Evo_morales_2_year_bolivia_Joel_Alvarez.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/2010_wet_season_cloud_over_colombia.jpg/640px-2010_wet_season_cloud_over_colombia.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/27/Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg/640px-Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/2010-10-23-Demo-Stuttgart21-Befuerworter.png/640px-2010-10-23-Demo-Stuttgart21-Befuerworter.png',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg/640px-51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Tough_return%2C_365.35.jpg/640px-Tough_return%2C_365.35.jpg'
    ]

    print('Resetting the database...')
    db.drop_all()
    db.create_all()

    # Create sources
    print('Creating sources...')
    for url in json.load(sources):
        s = Source(ext_url=url, name='The Times') # fake name
        db.session.add(s)
    db.session.commit()
    num_sources = Source.query.count()
    print('Created {0} sources.'.format(num_sources))

    # Create articles
    entries = json.load(seeds)
    print('Seeding {0} articles...'.format(len(entries)))
    articles = []
    for entry in entries:
        if debug:
            print(json.dumps(entry, sort_keys=True, indent=4))

        source = Source.query.filter_by(ext_url=entry['source']).first()

        a = Article(
                ext_url=entry['url'],
                source=source,
                html=entry['html'],
                text=entry['text'],
                tags=entry['tags'],
                title=entry['title'],
                created_at=parse(entry['published']),
                updated_at=parse(entry['updated']),
                image=random.choice(sample_images) # fake image
        )
        articles.append(a)
        db.session.add(a)

        progress_bar(len(articles) / len(entries) * 100)

    db.session.commit()

    num_articles = Article.query.count()
    num_entities = Entity.query.count()
    print('Seeded {0} articles.'.format(num_articles))
    print('Found {0} entities.'.format(num_entities))

    print('Clustering articles into events...')
    Event.cluster(articles, threshold=0.02, debug=True)
    num_events = Event.query.count()
    print('Created {0} event clusters.'.format(num_events))

    print('Clustering events into stories...')
    events = Event.query.all()
    Story.cluster(events, threshold=0.02, debug=True)
    num_stories = Story.query.count()
    print('Created {0} story clusters.'.format(num_stories))

    print('\n\n==============================================')
    print('From {0} sources, seeded {1} articles, found {2} entities, created {3} events and {4} stories.'.format(num_sources, num_articles, num_entities, num_events, num_stories))
    print('==============================================\n\n')
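
seed() works against Flask-SQLAlchemy models and db.session, so it presumably has to run inside an application context. A minimal sketch of a driver, assuming some create_app() factory exists (both the factory and its import path are placeholders, not part of the code shown here):

from myproject.app import create_app  # placeholder import

app = create_app()
with app.app_context():
    seed(debug=True)
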
Example No. 4
def seed(debug=False):
    # Patch out saving images to S3.
    patcher = patch('argos.util.storage.save_from_url', autospec=True, return_value='https://i.imgur.com/Zf9mXlj.jpg')
    patcher.start()

    seeds = open('manage/core/data/seed.json', 'r')

    sample_images = [
        'https://upload.wikimedia.org/wikipedia/commons/d/d5/Michael_Rogers_-_Herbiers_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/6/6e/Brandenburger_Tor_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/ChicagoAntiGaddafiHopeless.jpg/576px-ChicagoAntiGaddafiHopeless.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Evo_morales_2_year_bolivia_Joel_Alvarez.jpg/640px-Evo_morales_2_year_bolivia_Joel_Alvarez.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/2010_wet_season_cloud_over_colombia.jpg/640px-2010_wet_season_cloud_over_colombia.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/27/Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg/640px-Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/2010-10-23-Demo-Stuttgart21-Befuerworter.png/640px-2010-10-23-Demo-Stuttgart21-Befuerworter.png',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg/640px-51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Tough_return%2C_365.35.jpg/640px-Tough_return%2C_365.35.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/8/83/Saint_Isaac%27s_Cathedral_in_SPB.jpeg/800px-Saint_Isaac%27s_Cathedral_in_SPB.jpeg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Ponta_de_S%C3%A3o_Louren%C3%A7o_north_north_east.jpg/800px-Ponta_de_S%C3%A3o_Louren%C3%A7o_north_north_east.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e9/TU_Bibl_01_DSC1099w.jpg/644px-TU_Bibl_01_DSC1099w.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fe/%D0%9C%D0%B0%D0%BA%D0%B5%D0%B4%D0%BE%D0%BD%D0%B8%D1%83%D0%BC_-_%D0%9A%D1%80%D1%83%D1%88%D0%B5%D0%B2%D0%BE.jpg/800px-%D0%9C%D0%B0%D0%BA%D0%B5%D0%B4%D0%BE%D0%BD%D0%B8%D1%83%D0%BC_-_%D0%9A%D1%80%D1%83%D1%88%D0%B5%D0%B2%D0%BE.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/6/64/Puente_Mong%2C_Ciudad_Ho_Chi_Minh%2C_Vietnam%2C_2013-08-14%2C_DD_01.JPG/800px-Puente_Mong%2C_Ciudad_Ho_Chi_Minh%2C_Vietnam%2C_2013-08-14%2C_DD_01.JPG',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Autignac%2C_H%C3%A9rault_01.jpg/800px-Autignac%2C_H%C3%A9rault_01.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Caesio_teres_in_Fiji_by_Nick_Hobgood.jpg/800px-Caesio_teres_in_Fiji_by_Nick_Hobgood.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Ash_in_Yogyakarta_during_the_2014_eruption_of_Kelud_01.jpg/800px-Ash_in_Yogyakarta_during_the_2014_eruption_of_Kelud_01.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/b/bd/12-07-12-wikimania-wdc-by-RalfR-010.jpg/800px-12-07-12-wikimania-wdc-by-RalfR-010.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Mortagne-sur-Gironde_Civellier_Mayflowers_2013.jpg/800px-Mortagne-sur-Gironde_Civellier_Mayflowers_2013.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/British_Museum_Great_Court%2C_London%2C_UK_-_Diliff.jpg/611px-British_Museum_Great_Court%2C_London%2C_UK_-_Diliff.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Mercedes-Benz_Museum_201312_08_blue_hour.jpg/800px-Mercedes-Benz_Museum_201312_08_blue_hour.jpg'
    ]

    print('Resetting the database...')
    db.reflect()
    db.drop_all()
    db.create_all()

    # Create sources
    print('Creating sources...')
    create_sources()
    num_sources = Source.query.count()
    print('Created {0} sources.'.format(num_sources))

    # Create articles
    entries = json.load(seeds)
    print('Seeding {0} articles...'.format(len(entries)))
    articles = []
    for entry in entries:
        if debug:
            print(json.dumps(entry, sort_keys=True, indent=4))

        feed = Feed.query.filter_by(ext_url=entry['source']).first()

        a = Article(
                ext_url=entry['url'],
                feed=feed,
                source=feed.source,
                html=entry['html'],
                text=entry['text'],
                tags=entry['tags'],
                title=entry['title'],
                created_at=parse(entry['published']),
                updated_at=parse(entry['updated']),
                image=random.choice(sample_images), # fake image
                score=random.random() * 100         # fake score
        )
        articles.append(a)
        db.session.add(a)

        progress_bar(len(articles) / len(entries) * 100)

    print('Creating additional articles...')

    # Collect all appropriate files.
    all_files = []
    for dir, subdir, files in os.walk('manage/core/data/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
    for i, (dir, name, filepath) in enumerate(all_files):
        category = dir.split('/')[-1]
        with open(filepath, 'r') as f:
            article = Article(
                    text=f.read(),
                    title=name.split('/')[-1],
                    ext_url='http://fauxurl/',
                    source=Source.query.get(1),         # fake source
                    image=random.choice(sample_images), # fake image
                    score=random.random() * 100         # fake score
            )
        db.session.add(article)
        articles.append(article)
        # Use the file index for progress; articles already holds the seeded entries.
        progress_bar((i + 1) / len(all_files) * 100)

    db.session.commit()

    num_articles = Article.query.count()
    num_concepts = Concept.query.count()
    print('Seeded {0} articles.'.format(num_articles))
    print('Found {0} concepts.'.format(num_concepts))

    print('Clustering articles into events...')
    cluster.cluster(articles)
    num_events = Event.query.count()
    print('Created {0} event clusters.'.format(num_events))

    print('Clustering events into stories...')
    # TO DO
    num_stories = Story.query.count()
    print('Created {0} story clusters.'.format(num_stories))

    patcher.stop()

    print('\n\n==============================================')
    print('From {0} sources, seeded {1} articles, found {2} concepts, created {3} events and {4} stories.'.format(num_sources, num_articles, num_concepts, num_events, num_stories))
    print('==============================================\n\n')

    client = current_app.test_client()
    ctx = current_app.test_request_context()
    ctx.push()
    register_user(email='[email protected]', password='password')
    ctx.pop()
    print('\n\n==============================================')
    print('Created a test user, email is [email protected], password is password')
    print('==============================================\n\n')

    client = Client(
        #client_id=gen_salt(40),
        #client_secret=gen_salt(50),
        client_id='test',
        client_secret='test',
        _redirect_uris='http://localhost:5000/authorized',
        _default_scopes='userinfo',
        _allowed_grant_types='authorization_code refresh_token password',
        user_id=None,
        is_confidential=True # make a confidential client.
    )
    db.session.add(client)
    db.session.commit()
    print('\n\n==============================================')
    print('Created a test client:\nclient id: {0}\nclient secret: {1}'.format(client.client_id, client.client_secret))
    print('==============================================\n\n')
Example No. 5
def download(url, save_path, filename=None, progress=False):
    """
    Downloads a file from the specified URL.
    Will resume an existing download if the target
    server supports it (responds with the "Accepts-Range" header).

    Args:
        | url (str)       -- url of the file to download
        | save_path (str) -- path to the directory to save the file
        | progress (bool) -- output progress bar to stdout
    """

    # Strip trailing slash, if there is one.
    save_path = save_path.rstrip('/')
    if filename is None:
        filename = url.split('/').pop()
    file = '{0}/{1}'.format(save_path, filename)

    existing_size = 0

    # If file already exists,
    # but there isn't a newer file on the server...
    if os.path.exists(file) and not _expired(url, file):
        # Append to existing file.
        outfile = open(file, 'ab')

        # Figure out how many bytes we've got.
        existing_size = outfile.tell()

        # Setup request for only the remaining bytes.
        headers = {'Range': 'bytes={0}-'.format(existing_size)}
        req = request.Request(url, headers=headers)

    # Otherwise, create a new/overwrite existing file.
    else:
        # Create/overwrite file.
        outfile = open(file, 'wb')
        outfile.seek(0)

        # Vanilla request.
        req = request.Request(url)

    try:
        # Get response.
        resp = request.urlopen(req)

        # Get total size of content.
        total_size = float(resp.headers['Content-Length'].strip())

        # Check if the file has already been fully downloaded.
        if total_size == existing_size:
            logger.info('File already downloaded.')
            return

        # Check that the server accepts ranges.
        # If it does not, the server will ignore the Range header,
        # and we have to start all over again.
        if existing_size > 0 and not resp.headers.get('Accept-Ranges', None):
            logger.info('Server does not allow resuming of downloads.')
            logger.info('Starting from the beginning! :D')
            outfile.close()
            outfile = open(file, 'wb')
            outfile.seek(0)
            existing_size = 0

        if progress:
            progress_bar( (existing_size/total_size) * 100 )

        # Pull out the chunks!
        for chunk in iter(lambda: resp.read(CHUNK), b''):
            # Write the chunk to the file.
            outfile.write(chunk)

            # Update existing size.
            existing_size += len(chunk)

            percent_complete = (existing_size/total_size) * 100

            # Show progress.
            if progress:
                progress_bar(percent_complete)

        if progress:
            sys.stdout.write('\n')

        outfile.close()

        # Return the download's filepath.
        return file

    except request.HTTPError as e:
        logger.error('HTTP Error: {0} {1}'.format(e.code, url))
    except request.URLError as e:
        logger.error('URL Error: {0} {1}'.format(e.reason, url))
Example No. 6
    def run(self, dumppath, use_patch):
        if use_patch:
            print('Patching out saving images to S3...')
            patcher = patch('argos.util.storage.save_from_url', autospec=True, return_value='https://i.imgur.com/Zf9mXlj.jpg')
            patcher.start()
        else:
            patcher = None

        print('Loading sources...')
        sources_map = {}
        with open(os.path.join(dumppath, 'sources.json'), 'r') as f:
            sources = json.load(f)
            for i, s in enumerate(sources):
                source = Source.query.filter(Source.name == s['name']).first()
                if not source:
                    source = Source(name=s['name'])
                    db.session.add(source)
                id = s['_id']['$oid']
                sources_map[id] = source

                progress_bar(i/(len(sources) - 1) * 100)

        db.session.commit()

        print('\nLoading feeds...')
        feeds_map = {}
        with open(os.path.join(dumppath, 'feeds.json'), 'r') as f:
            feeds = json.load(f)
            for i, entry in enumerate(feeds):
                feed = Feed.query.filter(Feed.ext_url == entry['ext_url']).first()
                if not feed:
                    feed = Feed(ext_url=entry['ext_url'])
                    db.session.add(feed)
                feed.source = sources_map[entry['source']['$oid']]

                id = entry['_id']['$oid']
                feeds_map[id] = feed

                progress_bar(i/(len(feeds) - 1) * 100)

        db.session.commit()

        print('\nLoading articles...')
        with open(os.path.join(dumppath, 'articles.json'), 'r') as f:
            articles = json.load(f)
            for i, a in enumerate(articles):

                authors = []
                for author in a['authors']:
                    authors.append(Author.find_or_create(name=author))

                existing = Article.query.filter(Article.ext_url == a['ext_url']).first()

                if not existing:
                    feed = feeds_map[a['feed']['$oid']]
                    article = Article(
                        ext_url=a['ext_url'],
                        source=feed.source,
                        feed=feed,
                        html=None,    # not saved by argos.corpora
                        text=fix_text_segment(a['text']),
                        authors=authors,
                        tags=[],
                        title=fix_text_segment(a['title']),
                        created_at=datetime.fromtimestamp(a['created_at']['$date']/1000),
                        updated_at=datetime.fromtimestamp(a['updated_at']['$date']/1000),
                        image=a['image'],
                        score=evaluator.score(a['ext_url'])
                    )
                    db.session.add(article)
                progress_bar(i/(len(articles) - 1) * 100)

        print('Loaded {0} sources, {1} feeds, and {2} articles.'.format(len(sources), len(feeds), len(articles)))
        print('Done!')

        if patcher is not None:
            patcher.stop()
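
run() reads MongoDB-style export files in which ids and dates are wrapped in $oid and $date objects. The field names below are the ones the code actually accesses; the values are invented and only illustrate the expected shape of the records in sources.json, feeds.json and articles.json.

# Illustrative record shapes for the dump files (values are made up).
example_source = {'_id': {'$oid': '52a7a1...'}, 'name': 'The Times'}

example_feed = {
    '_id': {'$oid': '52b3f2...'},
    'ext_url': 'http://example.com/rss',
    'source': {'$oid': '52a7a1...'},
}

example_article = {
    'ext_url': 'http://example.com/some-story',
    'feed': {'$oid': '52b3f2...'},
    'authors': ['Jane Doe'],
    'title': 'Some story',
    'text': 'Full article text...',
    'image': 'http://example.com/image.jpg',
    'created_at': {'$date': 1394749200000},  # epoch milliseconds
    'updated_at': {'$date': 1394749200000},
}
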