def testUploadCSV(self):
    """Upload a sample CSV with a limit of 5 and check the stored document count.

    After the upload, every ``Document`` row is queried and the test asserts
    that no more than five rows exist.
    """
    upload_csv(path='../data/BillGates.csv', limit=5)
    db = Session()
    stored_documents = db.query(Document).all()
    db.close()
    self.assertLessEqual(len(stored_documents), 5)
Beispiel #2
0
def corpus_update(id_):
    """Render the corpus rename form (GET) or apply a rename (other methods).

    GET: looks up the corpus via ``get_corpus``. If the lookup fails, the
    failure is stored in ``flask_session['status']`` and the client is
    redirected to ``/corpus``. Otherwise ``corpus.html`` is rendered with the
    form pre-filled with the current name, and the pending status message
    (if any) is passed to the template and then cleared.

    Other methods: validates the submitted form, calls ``update_corpus`` and
    stores the outcome in ``flask_session['status']`` before redirecting to
    ``/corpus``.

    :param id_: identifier of the corpus to update.
    :return: the rendered template or a redirect response.
    """
    try:
        if flask_request.method == 'GET':
            state, result = get_corpus(id_)
            if not state:
                # Bail out before opening a database session so nothing is
                # left unclosed on this early exit.
                flask_session['status'] = dict(state=state, message=result)
                return redirect('/corpus')

            form = CorpusForm()
            form.name.data = result.name

            render_args = dict(form=form, update_id=id_)
            if 'status' in flask_session and flask_session['status'] is not None:
                render_args['status'] = flask_session['status']

            session = Session()
            try:
                # Render while the session is still open; it is closed even
                # when rendering raises (e.g. TemplateNotFound).
                template = render_template(
                    'corpus.html', corpora=get_corpora(session), **render_args
                )
            finally:
                session.close()
            # The status message is shown once, so reset it after rendering.
            flask_session['status'] = None
            return template

        form = CorpusForm(flask_request.form)
        if form.validate():
            state, message = update_corpus(id_, form.name.data)
        else:
            state, message = False, 'Form validation failed. Please check the name and try again!'
        flask_session['status'] = dict(state=state, message=message)
        return redirect('/corpus')
    except TemplateNotFound:
        abort(404)
Beispiel #3
0
def get_corpora(session=None):
    """Return all ``Corpus`` rows.

    :param session: an open database session to query with. When omitted, a
        temporary session is created and closed before returning; the caller
        then receives the objects after their session has been closed.
    :return: list of ``Corpus`` instances.
    """
    # Remember ownership *before* rebinding ``session``; testing
    # ``session is None`` afterwards is always false, so the temporary
    # session would never be closed.
    owns_session = session is None
    if owns_session:
        session = Session()
    try:
        return session.query(Corpus).all()
    finally:
        if owns_session:
            session.close()
Beispiel #4
0
def render_wordcloud(form, **kwargs):
    """Render the word-cloud view for the documents matching ``form``.

    Runs the search described by ``form.values()``, tokenizes each result's
    text (dropping tokens whose lowercase form is in ``stopwords_en``), builds
    an NLTK ``TextCollection`` and passes its 1000 most common vocabulary
    entries and its collocations to the template.

    :param form: search form; ``form.values()`` supplies the search arguments.
    :param kwargs: extra context forwarded to ``render_template``.
    :return: the rendered ``results_wordcloud.html`` view.
    """
    session = Session()
    try:
        results = search.search(session, **form.values())

        # Create the corpus from the results: one token list per document.
        tknzr = TweetTokenizer()
        texts = []
        for r in results:
            tokens = []
            for sent in sent_tokenize(r.text.strip()):
                tokens.extend(
                    w for w in tknzr.tokenize(sent.strip())
                    if w.lower() not in stopwords_en
                )
            texts.append(tokens)

        corpus = nltk.TextCollection(texts)
        # Called for its side effect of computing the collocations that are
        # read back from the protected attribute below.
        # NOTE(review): relies on NLTK populating ``_collocations`` - confirm
        # against the installed NLTK version.
        corpus.collocations(100)
        # noinspection PyProtectedMember
        results = {
            'vocabulary': [list(i) for i in corpus.vocab().most_common(1000)],
            'collocations': corpus._collocations,
        }
        return render_template('./templates/search/results_wordcloud.html',
                               form=form,
                               results=results,
                               **kwargs)
    finally:
        # Close the session even when searching, tokenizing or rendering fails.
        session.close()
Beispiel #5
0
def get_corpus(id_):
    """Look up a single corpus by its ID.

    :param id_: identifier of the corpus to fetch.
    :return: ``(True, corpus)`` when exactly one row matches, otherwise
        ``(False, message)`` with a human-readable explanation.
    """
    session = Session()
    try:
        lookup = session.query(Corpus).filter(Corpus.id == id_)
        found = lookup.one()
    except MultipleResultsFound:
        outcome = (False, 'Multiple entries found! (Impossible)')
    except NoResultFound:
        outcome = (False, 'No such corpus exists. Please check the corpus ID.')
    else:
        outcome = (True, found)
    finally:
        session.close()
    return outcome
Beispiel #6
0
def render_documents(form, **kwargs):
    """Render one page of the documents matching ``form``.

    Runs the search described by ``form.values()``, paginates the results and
    writes the resulting page number back into ``form.page.data`` before
    rendering.

    :param form: search form; ``form.values()`` supplies both the search and
        the pagination arguments.
    :param kwargs: extra context forwarded to ``render_template``.
    :return: the rendered ``results_documents.html`` view.
    """
    session = Session()
    try:
        # Diagnostic output of the incoming search parameters.
        print(form.values(), kwargs)
        results = search.search(session, **form.values())
        results, page, nb_pages = paginate(results, nb_pages=True, **form.values())
        form.page.data = page
        return render_template('./templates/search/results_documents.html',
                               form=form,
                               nb_pages=nb_pages,
                               results=results,
                               **kwargs)
    finally:
        # Close the session even when searching, paginating or rendering fails.
        session.close()
Beispiel #7
0
def corpora_get():
    """List all corpora (GET) or add a corpus and then list them (other methods).

    For non-GET requests the submitted form is validated and, if valid,
    ``add_corpus`` is called; the outcome is stored in
    ``flask_session['status']``. In every case ``corpus.html`` is then
    rendered with a fresh form and the current corpora. A pending status
    message is passed to the template and cleared afterwards.

    :return: the rendered ``corpus.html`` view.
    """
    try:
        if flask_request.method != 'GET':
            submitted = CorpusForm(flask_request.form)
            if submitted.validate():
                state, message = add_corpus(**submitted.values())
            else:
                state, message = False, 'Form validation failed. Please check the name and try again!'
            flask_session['status'] = dict(state=state, message=message)

        # Always render with an empty form, also after a submission.
        render_args = dict(form=CorpusForm())
        if 'status' in flask_session and flask_session['status'] is not None:
            render_args['status'] = flask_session['status']

        session = Session()
        try:
            # Render while the session is open; close it even if rendering
            # raises (e.g. TemplateNotFound).
            template = render_template(
                'corpus.html', corpora=get_corpora(session), **render_args
            )
        finally:
            session.close()
        # The status message is shown once, so reset it after rendering.
        flask_session['status'] = None
        return template
    except TemplateNotFound:
        abort(404)
Beispiel #8
0
def render_spacetime(form, **kwargs):
    """Render the time/location view for the documents matching ``form``.

    Builds two aggregates from the search results:

    * ``timeline`` - one entry per calendar hour with the number of documents
      in that hour (``x`` is the ``[start, end]`` span in seconds as produced
      by ``time.mktime``).
    * ``location_features`` - a GeoJSON-style ``FeatureCollection`` with one
      point per named location that has both ``lat`` and ``lon`` set, carrying
      the number of documents at that location as ``density``.

    :param form: search form; ``form.values()`` supplies the search arguments.
    :param kwargs: extra context forwarded to ``render_template``.
    :return: the rendered ``results_spacetime.html`` view.
    """
    session = Session()
    try:
        # Materialize once: the results are traversed twice below.
        results = list(search.search(session, **form.values()))

        # Count documents per hour. An explicit mapping is used rather than
        # itertools.groupby, which only merges *adjacent* equal keys and would
        # split an hour into several entries if the results are not ordered
        # by time. Entries keep first-seen order (dict insertion order).
        hour_counts = {}
        for r in results:
            hour = (r.time.year, r.time.month, r.time.day, r.time.hour, 0, 0)
            hour_counts[hour] = hour_counts.get(hour, 0) + 1
        timeline = []
        for hour, count in hour_counts.items():
            hour_start = time.mktime(datetime(*hour).timetuple())
            hour_end = hour_start + 3600
            timeline.append({
                'id': list(hour),
                'x': [hour_start, hour_end],
                'y': count,
            })

        # Collect documents per location display name; documents without a
        # location or with an empty display name are ignored.
        by_location = {}
        for r in results:
            name = r.location.display_name if r.location else None
            if name is None or name == '':
                continue
            by_location.setdefault(name, []).append(r)

        location_features = {'type': 'FeatureCollection', 'features': []}
        # ``idx`` advances for every named location, including those skipped
        # below for lacking coordinates.
        for idx, (name, items) in enumerate(by_location.items()):
            place = items[0].location
            if place.lat and place.lon:
                location_features['features'].append({
                    "type": "Feature",
                    "id": '{}'.format(idx),
                    'properties': {
                        'name': name,
                        "density": len(items),
                    },
                    'geometry': {
                        'type': 'Point',
                        # GeoJSON order: longitude first, then latitude.
                        'coordinates': [place.lon, place.lat],
                    },
                })

        return render_template('./templates/search/results_spacetime.html',
                               form=form,
                               **kwargs,
                               timeline=timeline,
                               location_features=location_features)
    finally:
        # Close the session even when searching, aggregating or rendering fails.
        session.close()
Beispiel #9
0
def add_corpus(name):
    """Create a corpus called ``name`` unless one with that name already exists.

    :param name: name of the corpus to create.
    :return: ``(state, message)`` where ``state`` is ``True`` on success and
        ``message`` is a human-readable description of the outcome.
    """
    session = Session()
    try:
        existing = session.query(Corpus).filter_by(name=name).first()
        if not existing:
            session.add(Corpus(name=name))
            session.commit()
            return True, 'Corpus added successfully!'
        return False, 'Corpus with that name exists. Please use unique names for corpus.'
    except Exception:
        # Any failure is reported to the caller as a generic database error.
        return False, 'Failed to add corpus due to a database error.'
    finally:
        session.close()
Beispiel #10
0
def delete_corpus(id_):
    """Delete the corpus with the given ID.

    :param id_: identifier of the corpus to delete.
    :return: ``(state, message)`` where ``state`` is ``True`` on success and
        ``message`` is a human-readable description of the outcome.
    """
    session = Session()
    try:
        matching = session.query(Corpus).filter_by(id=id_)
        if not matching.first():
            # The lookup is by ID, so report it as such (same wording as the
            # other corpus helpers) instead of blaming the name.
            return False, 'No such corpus exists. Please check the corpus ID.'
        matching.delete()
        session.commit()
        return True, 'Corpus deleted successfully!'
    except IntegrityError:
        return False, 'Failed to delete corpus due to an integrity error. ' \
                      'There may be messages associated with this corpus.'
    except Exception:
        # Any other failure is reported as a generic database error.
        return False, 'Failed to delete corpus due to a database error.'
    finally:
        session.close()
Beispiel #11
0
def update_corpus(id_, name):
    """Rename the corpus identified by ``id_`` to ``name``.

    The rename is refused when any corpus already carries ``name`` or when no
    corpus with ``id_`` exists.

    :param id_: identifier of the corpus to rename.
    :param name: the new corpus name.
    :return: ``(state, message)`` where ``state`` is ``True`` on success and
        ``message`` is a human-readable description of the outcome.
    """
    session = Session()
    try:
        # The name clash is checked first, before looking up the target.
        if session.query(Corpus).filter_by(name=name).first():
            return False, 'Corpus with that name exists. Please use unique names for corpus.'

        target = session.query(Corpus).filter_by(id=id_).first()
        if not target:
            return False, 'No such corpus exists. Please check the corpus ID.'

        target.name = name
        session.commit()
        return True, 'Corpus updated successfully!'
    except Exception:
        # Any failure is reported to the caller as a generic database error.
        return False, 'Failed to update corpus due to a database error.'
    finally:
        session.close()
Beispiel #12
0
def store_tweets(tweets, featurizers, corpus='default'):
    """Store tweets as ``Document`` rows attached to a corpus.

    Each tweet's raw record (``tweet._json``) is mapped onto a ``Document``:
    text, optional parent document, location, creation time, author and
    language. Every featurizer is then applied and its key/value pairs are
    stored in ``message.features``. Each tweet is committed individually, so
    a failing tweet is rolled back without affecting the others.

    :param tweets: sized iterable of tweet objects exposing ``_json``.
    :param featurizers: callables taking the document and returning a mapping
        of feature name to value.
    :param corpus: name of the corpus the documents are added to; it is
        fetched or created via ``session.get_or_create``.
    :return: ``None``; progress and the error count are printed.
    """
    session = Session()
    try:
        corpus = session.get_or_create(Corpus, name=corpus)
        twitter = session.get_or_create(Source, name='com.twitter')
        nb_tweets, nb_records, nb_errors, errors = len(tweets), 0, 0, []
        for tweet in tweets:
            # noinspection PyProtectedMember
            record = tweet._json
            try:
                if 'text' not in record:
                    # Records without text are skipped entirely; they do not
                    # advance the progress counter either.
                    continue
                record['text'] = record['text'].encode('utf8').decode('utf8')
                message = session.get_or_create(Document, source=twitter, id=record['id_str'])
                message.text = record['text']
                if 'in_reply_to_status_id' in record:
                    parent_tweet_id = record['in_reply_to_status_id']
                    if parent_tweet_id is not None:
                        # NOTE(review): documents are keyed by ``id_str`` above
                        # but the parent uses ``in_reply_to_status_id`` -
                        # confirm both resolve to the same key type.
                        parent = session.get_or_create(Document, source=twitter, id=parent_tweet_id)
                        message.parent_id = parent.id
                if 'location' in record and record['location'] != '':
                    location_str = record['location'].replace(',', ' ').strip()
                    location = session.get_or_create(Location, location=location_str)
                    if location.display_name is None:
                        # Only resolve locations that have not been resolved before.
                        location_info = get_location_info(location_str)
                        if location_info is None:
                            location.display_name = location_str
                        else:
                            if 'display_name' in location_info:
                                location.display_name = location_info['display_name']
                            for field in ('address', 'geojson', 'lat', 'lon'):
                                setattr(location, field,
                                        location_info[field] if field in location_info else None)
                    message.location = location
                if 'created_at' in record:
                    message.time = datetime.strptime(record['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                if 'user' in record:
                    author = session.get_or_create(Author, source=twitter, id=record['user']['screen_name'])
                    author.name = record['user']['name']
                    message.author_id = author.id
                if 'lang' in record:
                    message.lang = record['lang']
                message.features = dict()
                for featurizer in featurizers:
                    for k, v in featurizer(message).items():
                        message.features[k] = v
                corpus.documents.append(message)
                session.commit()
            except (OperationalError, DataError) as err:
                # Discard this tweet's pending changes and carry on with the next.
                session.rollback()
                errors.append(err)
                nb_errors += 1
            print(progress(nb_records, nb_tweets, 'Uploading Progress'), end='', flush=True)
            nb_records += 1
        print('\nNumber of Errors: {}'.format(nb_errors))
    finally:
        # Close the session even when an unexpected exception aborts the upload.
        session.close()
Beispiel #13
0
def render_statistics(form, **kwargs):
    """Render vocabulary, collocation and feature statistics for a search.

    For the documents matching ``form`` this computes:

    * ``vocabulary`` - the 100 most common tokens;
    * ``collocations`` - each collocation paired with the per-feature average
      over the documents whose text contains the collocation;
    * ``features`` - a 20-bin histogram per (rescaled) feature;
    * ``summary`` - quartiles, minimum and maximum per (rescaled) feature.

    :param form: search form; ``form.values()`` supplies the search arguments.
    :param kwargs: extra context forwarded to ``render_template``.
    :return: the rendered ``results_statistics.html`` view.
    """
    session = Session()
    try:
        # Materialize once: the results are scanned again for every collocation.
        results = list(search.search(session, **form.values()))

        # Create the corpus from the results and gather the raw feature values.
        tknzr = TweetTokenizer()
        texts = []
        feature_map = defaultdict(list)
        for r in results:
            tokens = []
            for sent in sent_tokenize(r.text.strip()):
                tokens += tknzr.tokenize(sent.strip())
            texts.append(tokens)
            for f in r.features:
                feature_map[f].append(r.features[f])

        feature_summary = dict()
        summary = []
        nfm = rescale(feature_map)
        for fname, data in nfm.items():
            hist, bin_edges = np.histogram(data, 20)
            hist, bin_edges = hist.tolist(), bin_edges.tolist()
            # Pair consecutive bin edges with the count of the bin they bound.
            bins = [
                dict(x0=x, x1=y, length=z)
                for x, y, z in zip(bin_edges[:-1], bin_edges[1:], hist)
            ]
            feature_summary[fname] = dict(bins=bins)
            quartiles = np.percentile(data, [25, 50, 75])
            summary.append(
                dict(label=fname,
                     values=dict(Q1=quartiles[0],
                                 Q2=quartiles[1],
                                 Q3=quartiles[2],
                                 min=min(data),
                                 max=max(data),
                                 outliers=[]))
            )
        # The raw values are no longer needed once the summaries are built.
        del feature_map

        corpus = nltk.TextCollection(texts)
        # Called for its side effect of computing the collocations that are
        # read back from the protected attribute below.
        # NOTE(review): relies on NLTK populating ``_collocations`` - confirm
        # against the installed NLTK version.
        corpus.collocations(100)
        collocations = []
        # noinspection PyProtectedMember
        for collocation in corpus._collocations:
            phrase = ' '.join(collocation)
            matched = defaultdict(list)
            for r in results:
                if phrase in r.text:
                    for f in r.features:
                        matched[f].append(r.features[f])
            averages = {k: np.average(v) for k, v in matched.items()}
            collocations.append((collocation, averages))

        statistics = {
            'vocabulary': corpus.vocab().most_common(100),
            'collocations': collocations,
            'features': feature_summary,
            'summary': summary
        }
        return render_template('./templates/search/results_statistics.html',
                               form=form,
                               results=statistics,
                               **kwargs)
    finally:
        # Close the session even when searching, computing or rendering fails.
        session.close()