Ejemplos de SimpleParser en Python, ejemplos de matstract.extract.parsing.SimpleParser en Python

Ejemplo n.º 1

0

Mostrar archivo

 def __init__(self):
     '''
     Create the three collaborators this object keeps as attributes:
     a SimilarMaterials instance, the ``db`` handle of a default
     AtlasConnection, and a SimpleParser.
     '''
     # Similar-material lookup helper
     self.sm = SimilarMaterials()
     # Database handle taken from a default AtlasConnection
     self.db = AtlasConnection().db
     # Parser used to normalise material formulae
     self.parser = SimpleParser()

Ejemplo n.º 2

0

Mostrar archivo

Archivo: entity_cleanup.py Proyecto: maimaiti/matstract

def get_entities(material):
    '''
    Collect the named entities recorded for a material.

    The material string is normalised with ``SimpleParser.matgen_parser``,
    the DOIs whose ``unique_mats`` contain it are looked up in ``mats_``,
    and every ``test_ne`` document for those DOIs is read.

    :param material: string; material formula to look up
    :return: entities_dict: dict; maps each entity label ('PRO', 'SPL',
        'SMT', 'CMT', 'APL', 'DSC') to a flat list of the values found
        under that label across all matching documents. Values of length
        two or less are dropped. Lists are empty when nothing matches.
    '''
    # Normalize the material
    material = SimpleParser().matgen_parser(material)

    # Open connection and get NEs associated with the material
    db = open_db()
    dois = db.mats_.find({'unique_mats': material}).distinct('doi')
    entities = list(db.test_ne.find({'doi': {'$in': dois}}))

    def collect(label):
        # Flatten the per-document lists stored under `label`, skipping
        # very short values.
        return [
            value for doc in entities for value in doc[label]
            if len(value) > 2
        ]

    # Each label maps to its own list. Previously 'APL' was filled with the
    # SMT values and 'DSC' with the CMT values.
    return {
        'PRO': collect('PRO'),
        'SPL': collect('SPL'),
        'SMT': collect('SMT'),
        'CMT': collect('CMT'),
        'APL': collect('APL'),
        'DSC': collect('DSC'),
    }

Ejemplo n.º 3

0

Mostrar archivo

    def __init__(self):
        '''
        Load the similarity array, the material/index lookup tables and the
        feature scaler from their remote locations, and create the material
        parser.

        NOTE: the lookup tables and the scaler are unpickled from remote
        files. Unpickling can execute arbitrary code, so these URLs must
        stay under trusted control.
        '''
        ds = np.DataSource()

        # Load the similarity array. The URL is opened once through the
        # DataSource and the array is then read from the path that
        # DataSource reports for it; the handle itself is not needed, so it
        # is closed straight away instead of being leaked.
        array_url = 'https://s3-us-west-1.amazonaws.com/materialsintelligence/matminer_array.npy'
        ds.open(array_url).close()
        self.matminer_array = np.load(ds.abspath(array_url))

        # Other data: close every handle as soon as its payload is loaded.
        mat2index_url = 'https://s3-us-west-1.amazonaws.com/materialsintelligence/mat2index.p'
        index2mat_url = 'https://s3-us-west-1.amazonaws.com/materialsintelligence/index2mat.p'
        scaler_url = 'https://s3-us-west-1.amazonaws.com/materialsintelligence/scaler.p'
        with ds.open(mat2index_url, 'rb') as handle:
            self.mat2index = pickle.load(handle)
        with ds.open(index2mat_url, 'rb') as handle:
            self.index2mat = pickle.load(handle)
        with ds.open(scaler_url, 'rb') as handle:
            self.scaler = pickle.load(handle)

        # Mat parser
        self.parser = SimpleParser()

Ejemplo n.º 4

0

Mostrar archivo

def get_keywords(material):
    '''
    Build the keyword table for a material.

    :param material: string; material formula, normalised with
        SimpleParser.matgen_parser before the lookup
    :return: the result of generate_table over a two-column frame
        ('tf', 'tfidf') when a keywords document exists for the material,
        otherwise the string "No keywords for the specified material"
    '''
    db = AtlasConnection(db="test").db
    print(db.info)
    material = SimpleParser().matgen_parser(material)
    print("number of materials is", db.keywords.count())

    record = db.keywords.find_one({'material': material})
    if record is None:
        return "No keywords for the specified material"

    # Arrange both rankings before placing them side by side.
    tf_column = arrange_keywords(record['keywords_tf'])
    tfidf_column = arrange_keywords(record['keywords_tfidf'])

    table = pd.DataFrame()
    table['tf'] = tf_column
    table['tfidf'] = tfidf_column
    return generate_table(table)

Ejemplo n.º 5

0

Mostrar archivo

def generate_trends_graph(search=None, material=None, layout=None):
    '''
    Build the figure dict for the "mentions per year" trend of a material.

    Documents in ``ne_071018`` whose ``MAT`` matches the normalised material
    are joined to ``abstracts`` on ``doi`` and counted per abstract year.

    :param search: unused; kept so existing callers keep working
    :param material: string or None; material formula. When given it is
        normalised with SimpleParser.matgen_parser; None is passed to the
        ``$match`` stage unchanged
    :param layout: optional layout stored under the 'layout' key
    :return: hist: dict; 'data' holds a single trace with the years in 'x'
        (ascending) and the counts in 'y', both empty when nothing matched;
        'layout' is present only when ``layout`` is not None
    '''
    sp = SimpleParser()
    if material is not None:
        material = sp.matgen_parser(material)
    db = AtlasConnection(db="production").db

    pipeline = [
        {"$match": {"MAT": material}},
        {
            "$lookup": {
                "from": "abstracts",
                "localField": "doi",
                "foreignField": "doi",
                "as": "abstracts"
            }
        },
        {"$match": {"abstracts": {"$ne": []}}},
        {"$unwind": "$abstracts"},
        {"$project": {"year": "$abstracts.year"}},
        # NOTE(review): 'abstracts' looks already dropped by the previous
        # stage, so this one appears redundant. Kept as-is; confirm before
        # removing.
        {"$project": {"abstracts": 0}},
        {"$group": {"_id": "$year", "count": {"$sum": 1}}},
    ]

    # Run the aggregation once. It used to be executed twice: once only to
    # compare the cursor against None, then again to read the rows.
    counts_by_year = {}
    for row in db.ne_071018.aggregate(pipeline):
        # Years that differ only by type (e.g. "2015" and 2015) end up in
        # separate groups, so they are merged after converting to int.
        year = int(row["_id"])
        counts_by_year[year] = counts_by_year.get(year, 0) + row["count"]

    results = sorted(counts_by_year.items())
    print(results)

    # An empty result simply produces empty x/y lists, so no special case
    # is needed for "no data".
    hist = {
        "data": [{
            'x': [year for year, _ in results],
            'y': [count for _, count in results],
            'line': {
                "width": 2,
                "color": 'rgb(0, 0, 0)'
            }
        }]
    }
    if layout is not None:
        hist["layout"] = layout
    return hist

Ejemplo n.º 6

0

Mostrar archivo

def get_entities(mat, class_name="three columns"):
    '''
    Render the most frequent named entities recorded for a material.

    :param mat: string; material formula, normalised with
        SimpleParser.matgen_parser before the lookup
    :param class_name: string; passed through to gen_output. The value
        "three columns" selects the layout that also shows the trends graph
    :return: an html.Div with two rows of gen_output panels, or the string
        "No entities for the specified material" when no document matches
    '''
    # Normalize the material
    material = SimpleParser().matgen_parser(mat)

    # Open connection and get NEs associated with the material
    db = AtlasConnection(db="test").db
    entities = list(db.ne_norm.find({'MAT': material}))

    # list() never yields None, so the old `is not None` test could not
    # reach the "no entities" message; test for emptiness instead.
    if not entities:
        return "No entities for the specified material"
    num_entities = len(entities)

    def most_common(label, limit):
        # Flatten the per-document lists under `label`, drop very short
        # values and keep the `limit` most frequent ones with their counts.
        mentions = [
            value for doc in entities for value in doc[label]
            if len(value) > 2
        ]
        return nltk.FreqDist(mentions).most_common(limit)

    def panel(ranked, title):
        return gen_output(ranked, num_entities, title, material, class_name)

    pro = most_common('PRO', 40)  # properties
    # Applications: these two generic terms are filtered out of the ranking.
    apl = [(a, score) for a, score in most_common('APL', 20)
           if a not in ['coating', 'electrode']]
    spl = most_common('SPL', 3)  # phase labels
    smt = most_common('SMT', 20)  # synthesis methods
    cmt = most_common('CMT', 20)  # characterization methods
    dsc = most_common('DSC', 20)  # sample descriptors

    if class_name == "three columns":
        first_row = [
            html.Div(trends_app.display_trends_graph(material),
                     className="six columns"),
            panel(pro, 'Property'),
            panel(apl, 'Application'),
        ]
        second_row = [
            panel(cmt, 'Characterization'),
            panel(smt, 'Synthesis'),
            panel(dsc, 'Sample descriptor'),
            panel(spl, 'Phase'),
        ]
    else:
        first_row = [
            panel(pro, 'Property'),
            panel(apl, 'Application'),
            panel(cmt, 'Characterization'),
        ]
        second_row = [
            panel(smt, 'Synthesis'),
            panel(dsc, 'Sample descriptor'),
            panel(spl, 'Phase'),
        ]

    return html.Div([
        html.Div(first_row, className="row"),
        html.Div(second_row, className="row"),
    ])

Ejemplo n.º 7

0

Mostrar archivo

class SimilarMaterials:
    '''
    Rank known materials by cosine similarity of their feature vectors to
    the feature vector of a query material.

    NOTE: the lookup tables and the scaler are unpickled from remote files.
    Unpickling can execute arbitrary code, so these URLs must stay under
    trusted control.
    '''
    def __init__(self):
        '''
        Load the similarity array, the material/index lookup tables and the
        feature scaler from their remote locations, and create the material
        parser.
        '''
        ds = np.DataSource()

        # Load the similarity array. The URL is opened once through the
        # DataSource and the array is then read from the path that
        # DataSource reports for it; the handle itself is not needed, so it
        # is closed straight away instead of being leaked.
        array_url = 'https://s3-us-west-1.amazonaws.com/materialsintelligence/matminer_array.npy'
        ds.open(array_url).close()
        self.matminer_array = np.load(ds.abspath(array_url))

        # Other data
        mat2index_url = 'https://s3-us-west-1.amazonaws.com/materialsintelligence/mat2index.p'
        index2mat_url = 'https://s3-us-west-1.amazonaws.com/materialsintelligence/index2mat.p'
        scaler_url = 'https://s3-us-west-1.amazonaws.com/materialsintelligence/scaler.p'
        self.mat2index = self._load_pickle(ds, mat2index_url)
        self.index2mat = self._load_pickle(ds, index2mat_url)
        self.scaler = self._load_pickle(ds, scaler_url)

        # Mat parser
        self.parser = SimpleParser()

    @staticmethod
    def _load_pickle(ds, url):
        '''
        Unpickle the object stored at ``url``, closing the handle afterwards.
        '''
        with ds.open(url, 'rb') as handle:
            return pickle.load(handle)

    def get_mat_vector(self, mat):
        '''
        Build the scaled feature vector for a material.
        :param mat: string; chemical formula
        :return: mat_vector: the output of ``self.scaler.transform`` applied
            to the features arranged as a single row
        '''
        comp = Composition(mat)
        mat_vector = []

        # Add element property features
        ep_feat = ElementProperty.from_preset(preset_name="magpie")
        mat_vector += ep_feat.featurize(comp)

        # Oxidation state features, computed on the charge-decorated
        # composition
        comp_ox = comp.add_charges_from_oxi_state_guesses()
        os_feat = OxidationStates()
        mat_vector += os_feat.featurize(comp_ox)

        # Loop over other features
        featurizers = [
            AtomicOrbitals, BandCenter, Stoichiometry, ValenceOrbital,
            ElementFraction, TMetalFraction
        ]
        for featurizer in featurizers:
            feat = featurizer()
            mat_vector += feat.featurize(comp)

        # Drop the string-valued features and shape the rest as one row.
        mat_vector = np.array([
            el for el in mat_vector if not isinstance(el, str)
        ]).reshape(1, -1)
        mat_vector = self.scaler.transform(mat_vector)

        return mat_vector

    def get_similar_mats(self, mat, num_mats=10):
        '''
        Find the materials most similar to ``mat``.
        :param mat: string; chemical formula
        :param num_mats: int; number of similar materials to return
        :return: most_similar: list; materials ordered from most to least
            similar, with the query material itself excluded
        '''
        normalized_mat = self.parser.matgen_parser(mat)
        mat_vector = self.get_mat_vector(normalized_mat)
        similarity_scores = cosine_similarity(mat_vector,
                                              self.matminer_array).flatten()

        # Take one candidate more than requested so the query material can
        # be dropped from the result. Only the top slice is mapped back to
        # material names rather than the whole array.
        top_indices = np.argsort(similarity_scores)[::-1][:num_mats + 1]
        most_similar = [self.index2mat[idx] for idx in top_indices]
        try:
            most_similar.remove(normalized_mat)
        except ValueError:
            # The query is not among the candidates: drop the weakest one.
            most_similar = most_similar[:-1]
        return most_similar

Ejemplo n.º 8

0

Mostrar archivo

class SynthesisPredictor:
    '''
    A materials synthesis prediction tool.

    Usage is as follows:
    >>> sp = SynthesisPredictor()
    >>> summary = sp.get_synthesis_summary('LiFePO4')
    >>> sp.print_synthesis_summary('LiFePO4')
    '''
    def __init__(self):
        # Similar mats
        self.sm = SimilarMaterials()
        # Connect to db
        self.db = AtlasConnection().db
        # Mat parser
        self.parser = SimpleParser()

    def similar_mat_synthesis(self, mat, num_mats=10, num_smt=20):
        '''
        Find similar materials and return their synthesis methods
        :param mat: string; chemical formula
        :param num_mats: int; number of similar materials to consider
        :param num_smt: int; number of synthesis methods to return
        :return: smt_list: list; list of (synthesis method, score) tuples
        '''
        similar_mats = self.sm.get_similar_mats(mat, num_mats)
        smt_list = self.get_synthesis(similar_mats, num_smt)
        return smt_list

    def get_synthesis(self, mat_list, num_smt=20):
        '''
        Get the most common synthesis methods for a list of materials
        :param mat_list: list; list of material formulae
        :param num_smt: int; number of synthesis methods to return
        :return: smt_list: list; list of (synthesis method, score) tuples,
            where score is the fraction of matching documents that mention
            the method; empty when no document matches
        '''
        mat_list_norm = [self.parser.matgen_parser(_mat) for _mat in mat_list]
        docs = list(self.db.ne_norm.find({'MAT': {'$in': mat_list_norm}}))
        num_docs = len(docs)
        # set(): a method counts at most once per document, so that
        # count / num_docs is a fraction of documents.
        methods = [
            smt for doc in docs for smt in set(doc['SMT'])
            if smt not in rm_list
        ]
        fd = nltk.FreqDist(methods)
        # With no documents most_common() is empty, so the division below
        # is never evaluated with num_docs == 0.
        smt_list = [(item, score / num_docs)
                    for item, score in fd.most_common(num_smt)]
        return smt_list

    def get_synthesis_summary(self, mat):
        '''
        Get a synthesis summary for a material
        :param mat: string; chemical formula
        :return: synthesis_summary: dict; a summary of synthesis for the
            input mat with keys 'mat_synthesis', 'similar_mats' and
            'similar_mat_synthesis'
        '''
        synthesis_summary = {}
        synthesis_summary['mat_synthesis'] = self.get_synthesis([mat])
        # Compute the similar materials once and reuse them for the
        # synthesis lookup. This is equivalent to similar_mat_synthesis(mat)
        # with its defaults, without ranking the materials a second time.
        similar_mats = self.sm.get_similar_mats(mat)
        synthesis_summary['similar_mats'] = similar_mats
        synthesis_summary['similar_mat_synthesis'] = self.get_synthesis(
            similar_mats)
        return synthesis_summary

    @staticmethod
    def _print_smt_table(smt_list, mat):
        '''
        Print a (method, score) table, or a not-found line when it is empty
        :param smt_list: list; (synthesis method, score) tuples
        :param mat: string; chemical formula used in the not-found message
        '''
        if smt_list:
            print('{:<70} {:>6}'.format('SMT', 'SCORE'))
            print(
                '-----------------------------------------------------------------------------'
            )
            for method, score in smt_list:
                print('{:<70} {:>6.3f}'.format(method, score))
        else:
            print('{} not found in database...'.format(mat))

    def print_synthesis_summary(self, mat):
        '''
        Prints a synthesis summary for a material
        :param mat: string; chemical formula
        '''
        # Get the synthesis for the current mat
        print('##############################')
        print('Synthesis summary for {}'.format(mat))
        print('##############################')
        print('\n')

        print('Common synthesis methods for {}:'.format(mat))
        print(
            '-----------------------------------------------------------------------------'
        )
        self._print_smt_table(self.get_synthesis([mat]), mat)
        print('\n')

        # Get the similar mats
        print('Similar materials to {}:'.format(mat))
        print('----------')
        similar_mats = self.sm.get_similar_mats(mat)
        print('{:<70}'.format('MAT'))
        print('----------')
        for _mat in similar_mats:
            print('{:<70}'.format(_mat))
        print('\n')

        # Synthesis for similar materials; reuses the list computed above
        # instead of ranking the similar materials a second time.
        print('Common synthesis methods for {} similar materials:'.format(mat))
        print(
            '-----------------------------------------------------------------------------'
        )
        self._print_smt_table(self.get_synthesis(similar_mats), mat)
        print('\n')

Ejemplo n.º 9

0

Mostrar archivo

def get_entities(material):
    '''
    Render the most frequent named entities recorded for a material.

    The DOIs whose ``unique_mats`` contain the normalised material are
    looked up in ``mats_`` and the ``ne`` documents for those DOIs are
    counted per entity label.

    :param material: string; material formula, normalised with
        SimpleParser.matgen_parser before the lookup
    :return: an html.Div holding one gen_output panel per entity type, or
        the string "No entities for the specified material" when no
        document matches
    '''
    # Normalize the material
    material = SimpleParser().matgen_parser(material)

    # Open connection and get NEs associated with the material
    db = AtlasConnection(db="test").db
    dois = db.mats_.find({'unique_mats': material}).distinct('doi')
    entities = list(db.ne.find({'doi': {'$in': dois}}))

    # list() never yields None, so the old `is not None` test could not
    # reach the "no entities" message; test for emptiness instead.
    if not entities:
        return "No entities for the specified material"
    num_entities = len(entities)

    def most_common(label, limit, mapping=None):
        # Flatten the per-document lists under `label` and drop very short
        # values. With a mapping, values it does not contain are discarded
        # and the rest are replaced by their mapped form before counting.
        raw = (value for doc in entities for value in doc[label]
               if len(value) > 2)
        if mapping is None:
            values = list(raw)
        else:
            values = [mapping[value] for value in raw if value in mapping]
        return nltk.FreqDist(values).most_common(limit)

    pro = most_common('PRO', 20, pro_dict)  # properties
    apl = most_common('APL', 10, apl_dict)  # applications
    spl = most_common('SPL', 3)  # phase labels, counted as stored
    smt = most_common('SMT', 10, smt_dict)  # synthesis methods
    cmt = most_common('CMT', 10, cmt_dict)  # characterization methods
    dsc = most_common('DSC', 10, dsc_dict)  # sample descriptors

    return html.Div([
        gen_output(pro, num_entities, 'Property', material),
        gen_output(cmt, num_entities, 'Characterization', material),
        gen_output(smt, num_entities, 'Synthesis', material),
        gen_output(spl, num_entities, 'Phase', material),
        gen_output(apl, num_entities, 'Application', material),
        gen_output(dsc, num_entities, 'Sample descriptor', material),
    ])