Example #1
def get_content(DOI, refresh=True, *args, **kwds):
    """ Helper function to read file content as xml.

    Args:
        input_doi (str): DOI of article
        *args:
        **kwds:

    Returns:
        Content of returned XML file

    """

    if not refresh:
        db = open_db_connection()
        elsevier = db.elsevier
        entries = list(elsevier.find({"doi": DOI}))
        if entries:
            if len(entries) > 1:
                print(
                    "More than one entry for given DOI! Only using only first entry."
                )
            entry = entries[0]
            if entry["collected"]:
                content = entry["xml"]
                return content
    content = download(*args, **kwds).text
    return content
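A minimal usage sketch for the helper above, assuming open_db_connection and download() come from the same module; the DOI string is only a placeholder:

# Hypothetical call: with refresh=False the cached XML in the elsevier collection
# is returned when available; otherwise the extra arguments are forwarded to download().
xml = get_content("10.1016/j.example.2018.01.001", refresh=False)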
Example #2
def search_for_material(material, search):
    db = open_db_connection()
    if search:
        results = db.abstracts.find({"$text": {"$search": search}, "chem_mentions.names": material}, ["year"])
    else:
        results = db.abstracts.find({"chem_mentions.names": material}, ["year"])
    return list(results)
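A hedged usage sketch; the material and search strings below are placeholders ("GaN" also appears later on this page):

# Placeholder query: abstracts that mention GaN and match a full-text search.
docs = search_for_material("GaN", "band gap")
years = [doc["year"] for doc in docs if "year" in doc]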
Example #3
def search_for_topic(search):
    db = open_db_connection(db="matstract_db")
    if search:
        results = db.abstracts.find({"$or": [{"title": {"$regex": ".*{}.*".format(search)}},
                                             {"abstract": {"$regex": ".*{}.*".format(search)}}]}, ["year"])
        print(results.count())
        return list(results)
    else:
        return []
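A hedged usage sketch; the topic string is hypothetical, and an empty search simply returns an empty list:

# Placeholder topic; note the regex match is case-sensitive as written above.
hits = search_for_topic("perovskite")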
Example #4
def get_keywords(material):
    db = open_db_connection(db="tri_abstracts")
    print(db.info)
    parser = SimpleParser()
    material = parser.matgen_parser(material)
    print("number of materials is", db.keywords.count())
    keywords = db.keywords.find_one({'material': material})
    if keywords is not None:
        tf = keywords['keywords_tf']
        tf_arranged = arrange_keywords(tf)
        tfidf = keywords['keywords_tfidf']
        tfidf_arranged = arrange_keywords(tfidf)
        df = pd.DataFrame()
        df['tf'] = tf_arranged
        df['tfidf'] = tfidf_arranged
        return generate_table(df)
    else:
        return "No keywords for the specified material"
Example #5
def check_scopus_collection(year, issn):
    """
    Checks the elsevier_log collection on MongoDB to determine whether the data for a given
    year/journal combination has been collected.

    Args:
        year: (str) year
        issn: (str) issn of journal

    Returns:
        (bool) True if the status of the year/journal pair is "complete".

    """
    db = open_db_connection()
    log = db.elsevier_log
    entry = log.find({"year": year, "issn": issn})[0]
    if entry["status"] == "complete":
        return True
    elif entry["status"] == "incomplete":
        return False
    else:
        raise KeyError("Entry has no status!")
Example #6
import dash_html_components as html
import dash_core_components as dcc
import pandas as pd
import random
from matstract.utils import open_db_connection, open_es_client
from matstract.extract import parsing
from bson import ObjectId

db = open_db_connection(db="matstract_db")
client = open_es_client()


def random_abstract():
    random_document = list(db.abstracts.aggregate([{
        "$sample": {
            "size": 1
        }
    }]))[0]
    return random_document['abstract']


def sort_results(results, ids):
    results_sorted = sorted(results, key=lambda k: ids.index(k['_id']))
    return results_sorted


def highlight_material(body, material):
    highlighted_phrase = html.Mark(material)
    if len(material) > 0 and material in body:
        chopped = body.split(material)
        newtext = []
Example #7
import dash
import dash_html_components as html
import dash_materialsintelligence as dmi

from flask_caching import Cache

from flask import send_from_directory
from matstract.web.view import annotate_app, similar_app, \
    search_app, keyword_app, extract_app, trends_app
from dash.dependencies import Input, Output, State
from matstract.web.callbacks import search_callbacks, annotate_callbacks, \
    extract_callbacks, keyword_callbacks, trends_callbacks, similar_callbacks
from matstract.utils import open_db_connection

import os

db = open_db_connection(local=True)

app = dash.Dash()
server = app.server

# To include local css and js files
app.css.config.serve_locally = True
app.scripts.config.serve_locally = True
app.config.suppress_callback_exceptions = True
app.title = "Matstract"

cache = Cache(server, config={"CACHE_TYPE": "simple"})

### CSS settings ###
BACKGROUND = 'rgb(230, 230, 230)'
COLORSCALE = [[0, "rgb(244,236,21)"], [0.3, "rgb(249,210,41)"], [0.4, "rgb(134,191,118)"],
Example #8
from dash.dependencies import Input, Output, State
import dash_html_components as html
from matstract.web.view import extract_app
from matstract.utils import open_db_connection

db = open_db_connection(local=True, db="matstract_db")


def bind(app):
    ### Extract App Callbacks ###
    @app.callback(Output('extract-highlighted',
                         'children'), [Input('extract-button', 'n_clicks')],
                  [State('extract-textarea', 'value')])
    def highlight_extracted(n_clicks, text):
        if n_clicks is not None:
            text, tags = extract_app.extract_ne(text)
            spaced_tags = []
            for tag in tags:
                #spaced_tags += [tag, html.Span()]
                span = html.Span(tag)
                span.style = {'padding-right': '15px'}
                spaced_tags.append(span)
            return html.Div(text), html.Br(), html.Div(
                html.Label('Extracted Entity tags:')), html.Div(spaced_tags)

    @app.callback(
        Output('extract-textarea', 'value'),
        # Output('similar-textarea', 'value'),
        [Input("extract-random", 'n_clicks')])
    def get_random(n_clicks):
        if n_clicks is not None:
Example #9
from dash.dependencies import Input, Output, State
from matstract.web.view import similar_app
from matstract.utils import open_db_connection

db = open_db_connection(db="matstract_db", local=True)


def bind(app, cache):
    @app.callback(
        Output('similar-textarea', 'value'),
        # Output('similar-textarea', 'value'),
        [Input("similar-random", 'n_clicks')])
    def get_random(n_clicks):
        if n_clicks is not None:
            text = similar_app.random_abstract()
            return text
        return ""


    @cache.memoize(timeout=600)
    @app.callback(
        Output('similar-table', 'children'),
        # [Input('search-box', 'value')])
        [Input('similar-button', 'n_clicks')],
        [State('similar-textarea', 'value'), State('similar-material-box', 'value')])
    def update_table(n_clicks, search, material):
        if material is not None:
            table = similar_app.generate_table(search, material)
        else:
            table = similar_app.generate_table(search)
        return table
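A hedged sketch of how these callback modules appear to be wired together, reusing the app and cache objects created in Example #7; the bind signatures match the two snippets above:

# Assumed wiring: each callbacks module exposes bind() and registers its Dash callbacks.
extract_callbacks.bind(app)
similar_callbacks.bind(app, cache)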
Example #10
    def __init__(self, local=False):
        self._db = open_db_connection(access="annotator", local=local)
Example #11
    return kwds_tf


def extract_tfidf(list_of_abstracts, db_l, count=5):
    kwds_tfidf = {}
    for tt in ['unigrams', 'bigrams', 'trigrams']:
        tf = TermFrequency(normalize=True,
                           first_last_sentence_only=True,
                           token_type=tt)
        tf.fit_tf(list_of_abstracts)
        most_common = tf.sorted_frequencies[:20]
        idf_scores = [(word, idf_mongo(db_l, word) * score)
                      for (word, score) in most_common]
        top_idf = sorted(idf_scores, key=itemgetter(1), reverse=True)[:count]
        kwds_tfidf[tt] = top_idf
    return kwds_tfidf


if __name__ == '__main__':
    db = open_db_connection()
    material = 'GaN'
    result = get_search_results(material=material)
    abstracts = [doc['abstract'] for doc in result]
    # Extract term frequencies
    term_frequencies = extract_tf(abstracts, count=5)
    # Extract tfidf
    tfidf = extract_tfidf(abstracts, db, count=5)
    for n_grams in ['unigrams', 'bigrams', 'trigrams']:
        print('####', n_grams, '####', sep='\n')
        for tf, tf_idf in zip(term_frequencies[n_grams], tfidf[n_grams]):
            print(tf, tf_idf)
Example #12
def contribute(user_creds="matstract/atlas_creds.json",
               max_block_size=100,
               num_blocks=1):
    """
    Gets an incomplete year/journal combination from elsevier_log, queries for the corresponding
    DOIs, and downloads the corresponding XMLs into the elsevier collection.

    Args:
        user_creds ((:obj:`str`, optional)): path to contributing user's write-permitted credential file.
        max_block_size ((:obj:`int`, optional)): maximum number of articles in block (~1s/article). Defaults to 100.
        num_blocks ((:obj:`int`, optional)): maximum number of blocks to run in session. Defaults to 1.

    """
    user = json.load(open(user_creds, 'r'))["name"]
    db = open_db_connection(user_creds=user_creds)
    log = db.elsevier_log
    elsevier = db.elsevier

    for i in range(num_blocks):
        # Verify access at start of each block to detect dropped VPN sessions.
        verify_access()

        # Get list of all available blocks sorted from largest to smallest.
        available_blocks = log.find(
            {
                "status": "incomplete",
                "num_articles": {
                    "$lt": max_block_size
                }
            }, ["year", "issn"]).limit(1).sort("num_articles", -1)

        # Break if no remaining blocks smaller than max_block_size
        if available_blocks.count() == 0:
            print(
                "No remaining blocks with size <= {}.".format(max_block_size))
            break
        else:
            print("Blocks remaining = {}".format(
                min(num_blocks - i, available_blocks.count())))

        target = available_blocks[0]
        date = datetime.datetime.now().isoformat()
        log.update_one(
            {
                "year": target["year"],
                "issn": target["issn"],
                "status": "incomplete"
            }, {
                "$set": {
                    "status": "in progress",
                    "updated_by": user,
                    "updated_on": date
                }
            })

        # Collect scopus for block
        print("Collecting entries for Block {}...".format(target["_id"]))
        dois = find_articles(year=target["year"],
                             issn=target["issn"],
                             get_all=True)
        new_entries = collect_entries(dois, user)

        # Insert entries into Matstract database
        print("Inserting entries into Matstract database...")
        for entry in tqdm(new_entries):
            if elsevier.find({"doi": entry["doi"]}).count():
                elsevier.update_one({"doi": entry["doi"]}, {"$set": entry})
            else:
                elsevier.insert_one(entry)

        # Mark block as completed in log
        date = datetime.datetime.now().isoformat()
        log.update_one(
            {
                "year": target["year"],
                "issn": target["issn"],
                "status": "in progress"
            }, {
                "$set": {
                    "status": "complete",
                    "completed_by": user,
                    "completed_on": date,
                    "updated_by": user,
                    "updated_on": date
                }
            })
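A minimal invocation sketch for contribute(), spelling out its documented defaults; the credential file path is the default from the signature and must exist locally:

# Collect one block of at most 100 articles with the default write-permitted credentials.
contribute(user_creds="matstract/atlas_creds.json", max_block_size=100, num_blocks=1)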
 def __init__(self, db_name="matstract_db", local=True):
     self._db = open_db_connection(local=local, db=db_name)