def testUploadCSV(self):
    """upload_csv with limit=5 must persist at most five Document rows."""
    csv_path = '../data/BillGates.csv'
    upload_csv(path=csv_path, limit=5)
    session = Session()
    stored = session.query(Document).all()
    session.close()
    self.assertLessEqual(len(stored), 5)
def corpus_update(id_):
    """Flask view for renaming a corpus.

    GET  -- show the corpus list with the update form pre-filled with the
            current name of corpus *id_* (plus any pending status banner).
    POST -- validate the submitted form, apply the rename via
            update_corpus(), stash the outcome in the session, and
            redirect back to /corpus.

    Returns a rendered template (GET) or a redirect response (POST / on
    lookup failure); aborts with 404 if the template is missing.
    """
    try:
        if flask_request.method == 'GET':
            form = CorpusForm()
            state, result = get_corpus(id_)
            if not state:
                # Lookup failed: surface the error and bounce to the list.
                flask_session['status'] = dict(state=state, message=result)
                return redirect('/corpus')
            form.name.data = result.name
            # Open the session only after the early-return above so it
            # cannot leak, and close it even if rendering raises.
            session = Session()
            try:
                corpora = get_corpora(session)
                if 'status' in flask_session and flask_session['status'] is not None:
                    template = render_template(
                        'corpus.html', corpora=corpora, form=form,
                        status=flask_session['status'], update_id=id_
                    )
                else:
                    template = render_template('corpus.html', corpora=corpora,
                                               form=form, update_id=id_)
            finally:
                session.close()
            flask_session['status'] = None  # status banner is shown once
            return template
        # POST: apply the rename.
        form = CorpusForm(flask_request.form)
        if form.validate():
            state, message = update_corpus(id_, form.name.data)
        else:
            state, message = False, 'Form validation failed. Please check the name and try again!'
        flask_session['status'] = dict(state=state, message=message)
        return redirect('/corpus')
    except TemplateNotFound:
        abort(404)
def get_corpora(session=None):
    """Return all Corpus rows.

    Args:
        session: optional SQLAlchemy session. If None, a session is
            created for this call and closed before returning; a
            caller-supplied session is left open for the caller to manage.

    Returns:
        list of Corpus instances.
    """
    # Bug fix: the original re-tested ``session is None`` *after*
    # assigning ``session = Session()``, so a locally-created session was
    # never closed. Track ownership before reassigning.
    owns_session = session is None
    if owns_session:
        session = Session()
    try:
        return session.query(Corpus).all()
    finally:
        if owns_session:
            session.close()
def render_wordcloud(form, **kwargs):
    """Render the word-cloud results page for the current search form.

    Tokenizes every hit's text (stopwords removed), builds an NLTK
    TextCollection, and hands the 1000 most common vocabulary items plus
    the top-100 collocations to the template.
    """
    session = Session()
    hits = search.search(session, **form.values())

    # Tokenize each hit into a stopword-free word list.
    tokenizer = TweetTokenizer()
    documents = []
    for hit in hits:
        words = []
        for sentence in sent_tokenize(hit.text.strip()):
            words.extend(
                token for token in tokenizer.tokenize(sentence.strip())
                if token.lower() not in stopwords_en
            )
        documents.append(words)

    collection = nltk.TextCollection(documents)
    collection.collocations(100)
    # noinspection PyProtectedMember
    payload = {
        'vocabulary': [list(pair) for pair in collection.vocab().most_common(1000)],
        'collocations': collection._collocations,
    }
    view = render_template('./templates/search/results_wordcloud.html',
                           form=form, results=payload, **kwargs)
    session.close()
    return view
def get_corpus(id_):
    """Fetch a single Corpus by primary key.

    Returns:
        (True, Corpus) on success, or (False, str) with an error message
        when zero or multiple rows match. The session is always closed
        before returning.
    """
    session = Session()
    try:
        return True, session.query(Corpus).filter(Corpus.id == id_).one()
    except MultipleResultsFound:
        return False, 'Multiple entries found! (Impossible)'
    except NoResultFound:
        return False, 'No such corpus exists. Please check the corpus ID.'
    finally:
        session.close()
def render_documents(form, **kwargs):
    """Render one page of document search results.

    Runs the search described by *form*, paginates the hits, writes the
    (possibly clamped) page number back into the form, and renders the
    documents results template.
    """
    session = Session()
    results = search.search(session, **form.values())
    # Fix: removed leftover debug ``print(form.values(), kwargs)`` that
    # dumped form data to stdout on every request.
    results, page, nb_pages = paginate(results, nb_pages=True, **form.values())
    form.page.data = page  # reflect the effective page back to the client
    view = render_template('./templates/search/results_documents.html', form=form,
                           nb_pages=nb_pages, results=results, **kwargs)
    session.close()
    return view
def corpora_get():
    """Flask view for listing and creating corpora.

    GET  -- render the corpus list with an empty creation form (plus any
            pending status banner from a previous action).
    POST -- validate the submitted form, create the corpus via
            add_corpus(), then render the list with the outcome banner.

    Returns the rendered template; aborts with 404 if it is missing.
    """
    try:
        if flask_request.method == 'GET':
            form = CorpusForm()
        else:
            form = CorpusForm(flask_request.form)
            if form.validate():
                state, message = add_corpus(**form.values())
            else:
                state, message = False, 'Form validation failed. Please check the name and try again!'
            flask_session['status'] = dict(state=state, message=message)
            form = CorpusForm()  # present a fresh form after the attempt
        # Shared render path; ``finally`` guarantees the session is closed
        # even if render_template raises (the original leaked it then).
        session = Session()
        try:
            if 'status' in flask_session and flask_session['status'] is not None:
                template = render_template(
                    'corpus.html', corpora=get_corpora(session), form=form,
                    status=flask_session['status']
                )
            else:
                template = render_template('corpus.html',
                                           corpora=get_corpora(session), form=form)
        finally:
            session.close()
        flask_session['status'] = None  # status banner is shown once
        return template
    except TemplateNotFound:
        abort(404)
def render_spacetime(form, **kwargs):
    """Render the space/time results view for the current search form.

    Builds two aggregates over the search hits:
      * ``timeline`` -- hit counts bucketed into hour-wide intervals, and
      * ``location_features`` -- a GeoJSON FeatureCollection with one
        Point per named location, weighted by hit count.

    NOTE(review): itertools.groupby only groups *adjacent* items, so this
    assumes ``results`` arrives ordered by time (first pass) and by
    location (second pass) -- confirm against search.search(). It also
    iterates ``results`` twice, so ``results`` must be a re-iterable
    sequence (e.g. a list), not a one-shot generator -- confirm.
    """
    session = Session()
    results = search.search(session, **form.values())
    timeline = []
    # The key list doubles as the argument list for datetime(*hour) below:
    # (year, month, day, hour, 0, 0) truncates each hit to its hour.
    for hour, items in groupby(
            results, lambda x: [x.time.year, x.time.month, x.time.day, x.time.hour, 0, 0]):
        hour_start = time.mktime(datetime(*hour).timetuple())  # epoch seconds
        hour_end = hour_start + 3600  # one hour later
        timeline.append({
            'id': hour,
            'x': [hour_start, hour_end],
            'y': sum(1 for _ in items)  # number of hits in this hour bucket
        })
    location_features = {'type': 'FeatureCollection', 'features': []}
    idx = 0  # sequential feature id, advanced only when a feature is emitted
    for location, items in groupby(
            results, lambda x: x.location.display_name if x.location else None):
        # Skip hits with no usable location name.
        if location is None or location == '':
            continue
        items = [i for i in items]  # materialize: the group is read twice below
        # Emit a point only when the group's first hit has coordinates.
        if items[0].location.lat and items[0].location.lon:
            point = [items[0].location.lon, items[0].location.lat]  # GeoJSON order: (lon, lat)
            location_features['features'].append({
                "type": "Feature",
                "id": '{}'.format(idx),
                'properties': {
                    'name': location,
                    "density": len(items),  # hit count for this location
                },
                'geometry': {
                    'type': 'Point',
                    'coordinates': point,
                },
            })
            idx += 1
    view = render_template('./templates/search/results_spacetime.html', form=form, **kwargs,
                           timeline=timeline, location_features=location_features)
    session.close()
    return view
def add_corpus(name):
    """Create a new Corpus row named *name*.

    Args:
        name: unique corpus name.

    Returns:
        (bool, str): success flag and a human-readable status message.
    """
    session = Session()
    try:
        instance = session.query(Corpus).filter_by(name=name).first()
        if instance:
            # Corpus names act as unique keys; refuse duplicates up front.
            return False, 'Corpus with that name exists. Please use unique names for corpus.'
        session.add(Corpus(name=name))
        session.commit()
        return True, 'Corpus added successfully!'
    except Exception:
        # Fix: roll back before closing -- a failed commit otherwise
        # leaves the session in an aborted transaction.
        session.rollback()
        return False, 'Failed to add corpus due to a database error.'
    finally:
        session.close()
def delete_corpus(id_):
    """Delete the Corpus with primary key *id_*.

    Args:
        id_: corpus primary key.

    Returns:
        (bool, str): success flag and a human-readable status message.
    """
    session = Session()
    try:
        instance = session.query(Corpus).filter_by(id=id_).first()
        if not instance:
            return False, 'Invalid corpus name'
        # Delete the instance we already fetched instead of issuing a
        # second, redundant query-delete for the same row.
        session.delete(instance)
        session.commit()
        return True, 'Corpus deleted successfully!'
    except IntegrityError:
        # Fix: roll back the aborted transaction before closing.
        session.rollback()
        return False, 'Failed to delete corpus due to an integrity error. ' \
                      'There may be messages associated with this corpus.'
    except Exception:
        session.rollback()
        return False, 'Failed to delete corpus due to a database error.'
    finally:
        session.close()
def update_corpus(id_, name):
    """Rename the Corpus with primary key *id_* to *name*.

    Args:
        id_: corpus primary key.
        name: new (unique) corpus name.

    Returns:
        (bool, str): success flag and a human-readable status message.
    """
    session = Session()
    try:
        instance = session.query(Corpus).filter_by(id=id_).first()
        if instance is None:
            return False, 'No such corpus exists. Please check the corpus ID.'
        # Fix: the original rejected *any* existing row with this name,
        # so renaming a corpus to its own current name always failed.
        # Only a *different* corpus holding the name is a conflict.
        conflict = session.query(Corpus).filter_by(name=name).first()
        if conflict is not None and conflict.id != instance.id:
            return False, 'Corpus with that name exists. Please use unique names for corpus.'
        instance.name = name
        session.commit()
        return True, 'Corpus updated successfully!'
    except Exception:
        # Roll back before closing so a failed commit does not leave the
        # session in an aborted transaction.
        session.rollback()
        return False, 'Failed to update corpus due to a database error.'
    finally:
        session.close()
def store_tweets(tweets, featurizers, corpus='default'):
    """Persist a batch of tweepy tweets as Document rows in *corpus*.

    For each tweet: upsert the Document, link parent (reply), location,
    author, timestamp, language, and computed features, then commit.
    Failed tweets are rolled back individually and counted; a progress
    line is printed per tweet and an error total at the end.

    Args:
        tweets: iterable of tweepy status objects (each with a ``_json`` dict).
        featurizers: callables mapping a Document to a dict of feature values.
        corpus: name of the target corpus (created if missing).

    NOTE(review): ``session.get_or_create`` is a project extension on the
    SQLAlchemy session -- presumably "return existing row or create one";
    confirm its flush/commit semantics against its definition.
    """
    session = Session()
    # Rebinds the ``corpus`` name from str to the Corpus row.
    corpus = session.get_or_create(Corpus, name=corpus)
    twitter = session.get_or_create(Source, name='com.twitter')
    nb_tweets, nb_records, nb_errors, errors = len(tweets), 0, 0, []
    for tweet in tweets:
        # noinspection PyProtectedMember
        record = tweet._json
        try:
            if 'text' not in record:
                continue
            # Round-trip through utf8 to normalize the text encoding.
            record['text'] = record['text'].encode('utf8').decode('utf8')
            message = session.get_or_create(Document, source=twitter, id=record['id_str'])
            message.text = record['text']
            # Link replies to their parent tweet (created as a stub if unseen).
            if 'in_reply_to_status_id' in record:
                parent_tweet_id = record['in_reply_to_status_id']
                if parent_tweet_id is not None:
                    parent = session.get_or_create(Document, source=twitter, id=parent_tweet_id)
                    message.parent_id = parent.id
            # Resolve the free-text location once; cached rows keep their
            # previously geocoded display_name.
            if 'location' in record and record['location'] != '':
                location_str = record['location'].replace(',', ' ').strip()
                location = session.get_or_create(Location, location=location_str)
                if location.display_name is None:
                    location_info = get_location_info(location_str)
                    if location_info is None:
                        # Geocoding failed: fall back to the raw string.
                        location.display_name = location_str
                    else:
                        if 'display_name' in location_info:
                            location.display_name = location_info['display_name']
                        location.address = location_info['address'] if 'address' in location_info else None
                        location.geojson = location_info['geojson'] if 'geojson' in location_info else None
                        location.lat = location_info['lat'] if 'lat' in location_info else None
                        location.lon = location_info['lon'] if 'lon' in location_info else None
                message.location = location
            if 'created_at' in record:
                # Twitter timestamp format, e.g. "Wed Aug 27 13:08:45 +0000 2008".
                message.time = datetime.strptime(record['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
            if 'user' in record:
                # Authors are keyed by screen_name within the twitter source.
                author = session.get_or_create(Author, source=twitter, id=record['user']['screen_name'])
                author.name = record['user']['name']
                message.author_id = author.id
            if 'lang' in record:
                message.lang = record['lang']
            # Merge all featurizer outputs into one feature dict.
            message.features = dict()
            for featurizer in featurizers:
                for k, v in featurizer(message).items():
                    message.features[k] = v
            corpus.documents.append(message)
            # Commit per tweet so one bad record cannot sink the batch.
            session.commit()
        except (OperationalError, DataError) as err:
            session.rollback()
            errors += [err]
            nb_errors += 1
        print(progress(nb_records, nb_tweets, 'Uploading Progress'), end='', flush=True)
        nb_records += 1
    print('\nNumber of Errors: {}'.format(nb_errors))
    session.close()
def render_statistics(form, **kwargs):
    """Render the statistics results view for the current search form.

    Computes, over the search hits:
      * per-feature histograms (20 bins) and quartile box-plot summaries,
      * the 100 most common vocabulary items, and
      * top-100 collocations with the average feature values of the hits
        containing each collocation.

    NOTE(review): ``rescale`` is an external helper -- presumably
    normalizes each feature's value list to a common range; confirm. The
    hits are iterated several times, so ``results`` must be a re-iterable
    sequence (a list), not a one-shot generator -- confirm.
    """
    session = Session()
    results = search.search(session, **form.values())
    # Create the corpus from the results
    tknzr = TweetTokenizer()
    texts = []
    for r in results:
        tokens = []
        for sent in sent_tokenize(r.text.strip()):
            tokens += tknzr.tokenize(sent.strip())
        texts.append(tokens)
    # Collect every feature's values across all hits: name -> list of values.
    feature_summary = dict()
    feature_map = defaultdict(lambda: [])
    for r in results:
        for f in r.features:
            feature_map[f].append(r.features[f])
    # normalized_feature_map = feature_map
    summary = []
    # nfm = quantile_normalize(feature_map)
    nfm = rescale(feature_map)
    # nfm = feature_map
    for fname, data in nfm.items():
        # 20-bin histogram, converted to plain lists for JSON rendering.
        hist, bin_edges = np.histogram(data, 20)
        hist, bin_edges = hist.tolist(), bin_edges.tolist()
        bins = [
            dict(x0=x, x1=y, length=z)
            for x, y, z in zip(bin_edges[:-1], bin_edges[1:], hist)
        ]
        feature_summary[fname] = dict(bins=bins)
        # Box-plot summary: quartiles plus observed min/max, no outliers.
        quartiles = np.percentile(data, [25, 50, 75])
        summary += [
            dict(label=fname,
                 values=dict(Q1=quartiles[0], Q2=quartiles[1], Q3=quartiles[2],
                             min=min(data), max=max(data), outliers=[]))
        ]
    del feature_map
    corpus = nltk.TextCollection(texts)
    corpus.collocations(100)
    collocations = []
    # noinspection PyProtectedMember
    for collocation in corpus._collocations:
        # Average each feature over the hits whose text contains this
        # collocation (naive substring match on the joined words).
        temp = defaultdict(lambda: [])
        for r in results:
            if ' '.join(collocation) in r.text:
                for f in r.features:
                    temp[f].append(r.features[f])
        # Replacing values for existing keys only -- safe during items().
        for k, v in temp.items():
            temp[k] = np.average(v)
        collocations += [(collocation, dict(temp))]
    # Rebinds ``results`` from the hit list to the template payload.
    results = {
        'vocabulary': corpus.vocab().most_common(100),
        'collocations': collocations,
        'features': feature_summary,
        'summary': summary
    }
    view = render_template('./templates/search/results_statistics.html', form=form,
                           results=results, **kwargs)
    session.close()
    return view