def get_content(DOI, refresh=True, *args, **kwds):
    """
    Helper function to read file content as xml.

    Args:
        DOI (str): DOI of article
        refresh (bool): if False, try the cached copy in the database before downloading
        *args: positional arguments passed through to download()
        **kwds: keyword arguments passed through to download()

    Returns:
        Content of returned XML file
    """
    if not refresh:
        db = open_db_connection()
        elsevier = db.elsevier
        # Materialize the cursor so the number of matches can be checked.
        entries = list(elsevier.find({"doi": DOI}))
        if len(entries):
            if len(entries) > 1:
                print("More than one entry for given DOI! Only using first entry.")
            entry = entries[0]
            if entry["collected"]:
                content = entry["xml"]
                return content
    content = download(*args, **kwds).text
    return content
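
# Usage sketch for get_content (the DOI below is hypothetical; assumes the
# article was previously cached with "collected" set to True, so no new
# download is triggered):
#
#   xml_content = get_content("10.1016/j.example.2018.01.001", refresh=False)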
def search_for_material(material, search):
    """Find abstracts mentioning a material, optionally filtered by a full-text search."""
    db = open_db_connection()
    if search:
        results = db.abstracts.find(
            {"$text": {"$search": search}, "chem_mentions.names": material},
            ["year"])
    else:
        results = db.abstracts.find({"chem_mentions.names": material}, ["year"])
    return list(results)
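
# Usage sketch for search_for_material ("thermoelectric" is an assumed
# free-text query; pass search=None to match on the material alone):
#
#   hits = search_for_material("GaN", "thermoelectric")
#   years = [hit["year"] for hit in hits]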
def search_for_topic(search):
    db = open_db_connection(db="matstract_db")
    if search:
        results = db.abstracts.find(
            {"$or": [{"title": {"$regex": ".*{}.*".format(search)}},
                     {"abstract": {"$regex": ".*{}.*".format(search)}}]},
            ["year"])
        print(results.count())
        return list(results)
    else:
        return []
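
# Usage sketch for search_for_topic (note the regex match is case-sensitive,
# so "perovskite" and "Perovskite" are distinct queries):
#
#   topic_hits = search_for_topic("perovskite")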
def get_keywords(material):
    db = open_db_connection(db="tri_abstracts")
    print(db.info)
    parser = SimpleParser()
    # Normalize the formula so equivalent spellings resolve to one entry.
    material = parser.matgen_parser(material)
    print("number of materials is", db.keywords.count())
    keywords = db.keywords.find_one({'material': material})
    if keywords is not None:
        tf = keywords['keywords_tf']
        tf_arranged = arrange_keywords(tf)
        tfidf = keywords['keywords_tfidf']
        tfidf_arranged = arrange_keywords(tfidf)
        df = pd.DataFrame()
        df['tf'] = tf_arranged
        df['tfidf'] = tfidf_arranged
        return generate_table(df)
    else:
        return "No keywords for the specified material"
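
# Usage sketch for get_keywords; returns an HTML table of tf and tf-idf
# keywords, or a message string if the material has no keywords entry:
#
#   keyword_table = get_keywords("GaN")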
def check_scopus_collection(year, issn):
    """
    Checks the elsevier_log collection on MongoDB to determine whether the data
    for a given year/journal combination has been collected.

    Args:
        year: (str) year
        issn: (str) issn of journal

    Returns:
        (bool) True if status of the year/journal pair is "complete"
    """
    db = open_db_connection()
    log = db.elsevier_log
    entry = log.find({"year": year, "issn": issn})[0]
    if entry["status"] == "complete":
        return True
    elif entry["status"] == "incomplete":
        return False
    else:
        raise KeyError("Entry has no status!")
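
# Usage sketch for check_scopus_collection (the ISSN below is only
# illustrative; an IndexError is raised if no log entry exists for the pair):
#
#   if not check_scopus_collection("2017", "0925-8388"):
#       print("Block not yet collected.")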
import dash_html_components as html
import dash_core_components as dcc
import pandas as pd
import random
from matstract.utils import open_db_connection, open_es_client
from matstract.extract import parsing
from bson import ObjectId

db = open_db_connection(db="matstract_db")
client = open_es_client()


def random_abstract():
    # $sample draws a uniformly random document from the abstracts collection.
    random_document = list(db.abstracts.aggregate(
        [{"$sample": {"size": 1}}]))[0]
    return random_document['abstract']


def sort_results(results, ids):
    # Order the results to match the given list of ids.
    results_sorted = sorted(results, key=lambda k: ids.index(k['_id']))
    return results_sorted


def highlight_material(body, material):
    highlighted_phrase = html.Mark(material)
    if len(material) > 0 and material in body:
        chopped = body.split(material)
        newtext = []
        # Interleave the split pieces with the highlighted material name.
        for piece in chopped[:-1]:
            newtext.append(piece)
            newtext.append(highlighted_phrase)
        newtext.append(chopped[-1])
        return newtext
    return body
import dash
import dash_html_components as html
import dash_materialsintelligence as dmi
from flask_caching import Cache
from flask import send_from_directory
from matstract.web.view import annotate_app, similar_app, \
    search_app, keyword_app, extract_app, trends_app
from dash.dependencies import Input, Output, State
from matstract.web.callbacks import search_callbacks, annotate_callbacks, \
    extract_callbacks, keyword_callbacks, trends_callbacks, similar_callbacks
from matstract.utils import open_db_connection
import os

db = open_db_connection(local=True)
app = dash.Dash()
server = app.server

# To include local css and js files
app.css.config.serve_locally = True
app.scripts.config.serve_locally = True
app.config.suppress_callback_exceptions = True
app.title = "Matstract"

cache = Cache(server, config={"CACHE_TYPE": "simple"})

### CSS settings ###
BACKGROUND = 'rgb(230, 230, 230)'
COLORSCALE = [[0, "rgb(244,236,21)"], [0.3, "rgb(249,210,41)"], [0.4, "rgb(134,191,118)"],
from dash.dependencies import Input, Output, State
import dash_html_components as html
from matstract.web.view import extract_app
from matstract.utils import open_db_connection

db = open_db_connection(local=True, db="matstract_db")


def bind(app):
    ### Extract App Callbacks ###
    @app.callback(Output('extract-highlighted', 'children'),
                  [Input('extract-button', 'n_clicks')],
                  [State('extract-textarea', 'value')])
    def highlight_extracted(n_clicks, text):
        if n_clicks is not None:
            text, tags = extract_app.extract_ne(text)
            spaced_tags = []
            for tag in tags:
                # Give each tag some horizontal breathing room.
                span = html.Span(tag, style={'padding-right': '15px'})
                spaced_tags.append(span)
            return html.Div(text), html.Br(), html.Div(
                html.Label('Extracted Entity tags:')), html.Div(spaced_tags)

    @app.callback(
        Output('extract-textarea', 'value'),
        [Input("extract-random", 'n_clicks')])
    def get_random(n_clicks):
        if n_clicks is not None:
            # Assumed completion, mirroring the similar_app callback: fetch a
            # random abstract to populate the textarea.
            return extract_app.random_abstract()
        return ""
from dash.dependencies import Input, Output, State
from matstract.web.view import similar_app
from matstract.utils import open_db_connection

db = open_db_connection(db="matstract_db", local=True)


def bind(app, cache):
    @app.callback(
        Output('similar-textarea', 'value'),
        [Input("similar-random", 'n_clicks')])
    def get_random(n_clicks):
        if n_clicks is not None:
            text = similar_app.random_abstract()
            return text
        return ""

    @cache.memoize(timeout=600)
    @app.callback(
        Output('similar-table', 'children'),
        [Input('similar-button', 'n_clicks')],
        [State('similar-textarea', 'value'),
         State('similar-material-box', 'value')])
    def update_table(n_clicks, search, material):
        # Fall back to a search-only query when no material is given.
        if material is not None:
            table = similar_app.generate_table(search, material)
        else:
            table = similar_app.generate_table(search)
        return table
def __init__(self, local=False):
    self._db = open_db_connection(access="annotator", local=local)
    return kwds_tf


def extract_tfidf(list_of_abstracts, db_l, count=5):
    kwds_tfidf = {}
    for tt in ['unigrams', 'bigrams', 'trigrams']:
        tf = TermFrequency(normalize=True, first_last_sentence_only=True,
                           token_type=tt)
        tf.fit_tf(list_of_abstracts)
        most_common = tf.sorted_frequencies[:20]
        idf_scores = [(word, idf_mongo(db_l, word) * score)
                      for (word, score) in most_common]
        top_idf = sorted(idf_scores, key=itemgetter(1), reverse=True)[:count]
        kwds_tfidf[tt] = top_idf
    return kwds_tfidf


if __name__ == '__main__':
    db = open_db_connection()
    material = 'GaN'
    result = get_search_results(material=material)
    abstracts = [doc['abstract'] for doc in result]

    # Extract term frequencies
    term_frequencies = extract_tf(abstracts, count=5)
    # Extract tfidf
    tfidf = extract_tfidf(abstracts, db, count=5)

    for n_grams in ['unigrams', 'bigrams', 'trigrams']:
        print('####', n_grams, '####', sep='\n')
        for tf, tf_idf in zip(term_frequencies[n_grams], tfidf[n_grams]):
            print(tf, tf_idf)
def contribute(user_creds="matstract/atlas_creds.json", max_block_size=100,
               num_blocks=1):
    """
    Gets an incomplete year/journal combination from elsevier_log, queries for
    the corresponding dois, and downloads the corresponding xmls for each to
    the elsevier collection.

    Args:
        user_creds ((:obj:`str`, optional)): path to contributing user's
            write-permitted credential file.
        max_block_size ((:obj:`int`, optional)): maximum number of articles in
            block (~1s/article). Defaults to 100.
        num_blocks ((:obj:`int`, optional)): maximum number of blocks to run
            in session. Defaults to 1.
    """
    user = json.load(open(user_creds, 'r'))["name"]
    db = open_db_connection(user_creds=user_creds)
    log = db.elsevier_log
    elsevier = db.elsevier
    for i in range(num_blocks):
        # Verify access at start of each block to detect dropped VPN sessions.
        verify_access()

        # Get the largest available incomplete block under max_block_size.
        available_blocks = log.find(
            {"status": "incomplete",
             "num_articles": {"$lt": max_block_size}},
            ["year", "issn"]).limit(1).sort("num_articles", -1)

        # Break if no remaining blocks smaller than max_block_size
        if available_blocks.count() == 0:
            print("No remaining blocks with size <= {}.".format(max_block_size))
            break
        else:
            print("Blocks remaining = {}".format(
                min(num_blocks - i, available_blocks.count())))

        target = available_blocks[0]
        date = datetime.datetime.now().isoformat()
        # Claim the block so other contributors skip it.
        log.update_one(
            {"year": target["year"], "issn": target["issn"],
             "status": "incomplete"},
            {"$set": {"status": "in progress", "updated_by": user,
                      "updated_on": date}})

        # Collect scopus for block
        print("Collecting entries for Block {}...".format(target["_id"]))
        dois = find_articles(year=target["year"], issn=target["issn"],
                             get_all=True)
        new_entries = collect_entries(dois, user)

        # Insert entries into Matstract database
        print("Inserting entries into Matstract database...")
        for entry in tqdm(new_entries):
            if elsevier.find({"doi": entry["doi"]}).count():
                elsevier.update_one({"doi": entry["doi"]}, {"$set": entry})
            else:
                elsevier.insert_one(entry)

        # Mark block as completed in log
        date = datetime.datetime.now().isoformat()
        log.update_one(
            {"year": target["year"], "issn": target["issn"],
             "status": "in progress"},
            {"$set": {"status": "complete", "completed_by": user,
                      "completed_on": date, "updated_by": user,
                      "updated_on": date}})
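
# Usage sketch for contribute: collect up to three blocks of at most 50
# articles each, using a write-permitted credentials file (path hypothetical):
#
#   contribute(user_creds="matstract/atlas_creds.json", max_block_size=50,
#              num_blocks=3)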
def __init__(self, db_name="matstract_db", local=True):
    self._db = open_db_connection(local=local, db=db_name)