Code Example #1
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove previously written parquet folders if they still exist
    for parquet_path in glob(os.path.join(save_dir, 'medline_*.parquet')):
        subprocess.call(['rm', '-rf', parquet_path])

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
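Example #1 above relies on several names defined elsewhere in its module: the SparkContext `sc`, the directories `download_dir` and `save_dir`, and the PySpark window/aggregate helpers. A minimal setup sketch under those assumptions (the directory paths are placeholders, not from the original source) could look like this:

import os
import subprocess
from glob import glob

import pubmed_parser as pp
from pyspark.sql import Row, SparkSession, Window
from pyspark.sql.functions import desc, max, rank

spark = SparkSession.builder.appName("medline_to_parquet").getOrCreate()
sc = spark.sparkContext

download_dir = "/data/medline/download"  # placeholder: where the *.xml.gz files were downloaded
save_dir = "/data/medline/parquet"       # placeholder: where the parquet outputs are written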
Code Example #2
def parse_medline_articles(path='medline', 
                           saved_path='parsed_articles', 
                           year_start=2000, 
                           year_stop=2018):
    """
    Given a ``path`` to a folder containing .xml.gz MEDLINE archives,
    parse the articles and save the parsed output to ``saved_path``

    Input
    =====
    path: str, path to folder with all .xml.gz files
    saved_path: str, path to folder where parsed articles are saved
    year_start: int, first publication year of articles to save
    year_stop: int, last publication year of articles to save
    """
    paths = glob(os.path.join(path, '*.xml.gz'))

    # check if the directory is not there
    if not os.path.isdir(saved_path):
        os.mkdir(saved_path)

    for i, path in enumerate(paths):
        all_parsed_papers = []
        parsed_papers = pp.parse_medline_xml(path)
        for paper in parsed_papers:
            try:
                if int(paper['pubdate']) >= year_start and int(paper['pubdate']) <= year_stop:
                    all_parsed_papers.append(paper)
            except (ValueError, TypeError):
                # skip records whose pubdate is missing or non-numeric
                pass
        save_json(all_parsed_papers, os.path.join(saved_path, 'parsed_%d.json' % i))
    print('done!')
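A hedged usage sketch for the function above: it depends on a `save_json` helper that is not shown in this snippet, so a minimal stand-in is assumed here.

import json

def save_json(obj, path):
    # assumed stand-in for the save_json helper used above
    with open(path, 'w') as f:
        json.dump(obj, f)

parse_medline_articles(path='medline', saved_path='parsed_articles',
                       year_start=2000, year_stop=2018)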
Code Example #3
def medline2txt(xml_in, file):
    analyze_out = pp.parse_medline_xml(xml_in)
    bcnt = 0
    # print 'Medline2Txt', xml_in

    for paper in analyze_out:
        title = paper['title'].encode('utf-8').replace('\n', ' ')
        if isinstance(title, unicode):
            title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore')
        title = re.sub(r'\s\s+', ' ', title)

        abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
        if isinstance(abstract, unicode):
            abstract = unicodedata.normalize('NFKD', abstract).encode('ascii', 'ignore')
        abstract = re.sub(r'\s\s+', ' ', abstract)

        text = '%s %s\n' % (title, abstract)  # PWTEES FORMAT
        file.write(text)

        bcnt = bcnt + 1
        if bcnt % 10000 == 0:
            print bcnt, 'medline records inserted.'
Code Example #4
def article_stream(path, batch_size):
    lines = []
    for line in open(path, "r"):
        lines.append(line.strip())

    pmid_batches = []
    for batch in make_batches(lines, batch_size):
        pmid_batches.append(batch)

    pmid_lists = []
    for batch in pmid_batches:
        pmid_lists.append(",".join(batch))

    for pmid_list in pmid_lists:
        print("Doing " + pmid_list.replace(",", ", ") + "...")

        api_url = build_api_url(pmid_list, retmode="xml")
        res = requests.get(api_url)

        if res.status_code != 200:
            raise requests.HTTPError(res.reason)

        with open("xml/tmp.xml", "w") as f:
            f.write(res.text)

        medline_json = pp.parse_medline_xml("xml/tmp.xml")

        for article in medline_json:
            yield article
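Because `article_stream` is a generator, a caller can iterate it lazily; a hypothetical usage (the PMID list file and batch size are placeholders):

for article in article_stream("pmids/example.txt", batch_size=20):
    print(article["pmid"], article["title"][:60])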
Code Example #5
def medline2txt(xml_in, pmids, job_size, output_dir):
    analyze_out = pp.parse_medline_xml(xml_in)
    bcnt = 0

    print 'Medline2Txt', xml_in

    for paper in analyze_out:
        pmid = paper['pmid']
        sub_dir = '%s/%d' % (output_dir, int(pmid) % job_size)

        if paper['pmid'] not in pmids:
            continue

        title = paper['title'].encode('utf-8').replace('\n', ' ')
        title = u2a_convert(pmid, title, 'title')

        abstract = ''
        if paper['abstract'] is not None:
            abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
            abstract = u2a_convert(pmid, abstract, 'abstract')
        else:
            print 'Cannot find abstract for PMID %s' % pmid

        f_tmp_in_fn = '%s/%s.txt' % (sub_dir, pmid)
        f_tmp_in = open(f_tmp_in_fn, 'w')
        #text = '%s|t|%s\n%s|a|%s\n' % (pmid, title, pmid, abstract) #PubTator Format
        text = '%s %s' % (title, abstract)  # PWTEES FORMAT
        f_tmp_in.write(text)
        f_tmp_in.close()

        bcnt = bcnt + 1
        if bcnt % 1000 == 0:
            print bcnt, 'medline records inserted.'
Code Example #6
def medline2txt(xml_in, pmids, job_size):
    analyze_out = pp.parse_medline_xml(xml_in)
    bcnt = 0

    print 'Medline2Txt', xml_in

    sub_dir = 'input_w/'
    if not os.path.exists(sub_dir):
        os.makedirs(sub_dir)
    f_tmp_in_fn = '%s/medline.txt' % (sub_dir)
    f_tmp_in = open(f_tmp_in_fn, 'w')

    for paper in analyze_out:
        if paper['abstract'] is None:
            # print "abisnone: ", paper['pmid']
            continue
        abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
        if isinstance(abstract, unicode):
            abstract = unicodedata.normalize('NFKD', abstract).encode('ascii', 'ignore')

        # text = '%s|t|%s\n%s|a|%s\n' % (pmid, title, pmid, abstract)  # PubTator format
        text = '%s\n' % (abstract)  # PWTEES FORMAT
        f_tmp_in.write(text)

        # bcnt = bcnt + 1
        # if bcnt % 1000 == 0:
        #     print bcnt, 'medline records inserted.'
    f_tmp_in.close()
Code Example #7
def download_from_pmid_list(path, batch_size=1):

    lines = []
    for line in open(path, "r"):
        lines.append(line.strip())

    pmid_batches = []
    for batch in make_batches(lines, batch_size):
        pmid_batches.append(batch)

    pmid_lists = []
    for batch in pmid_batches:
        pmid_lists.append(",".join(batch))

    print()

    for pmid_list in pmid_lists:
        print("———————" + pmid_list + "———————")
        print()

        api_url = build_api_url(pmid_list, retmode="xml")
        res = requests.get(api_url)

        if res.status_code != 200:
            raise requests.HTTPError(res.reason)

        d = xmltodict.parse(res.text)

        # with open("xml/tmp.xml", "w") as f:
        #     f.write(res.text)

        # medline_json = pp.parse_medline_xml("xml/tmp.xml")

        # for article in medline_json:
        #     with open("json/{}.json".format(article["pmid"]), "w") as f:
        #         f.write(json.dumps(article, indent=2))

        articles = d["PubmedArticleSet"]["PubmedArticle"]

        # When using xmltodict to convert a multiple-article request, the
        # PubmedArticle tag becomes a list instead of a dict.
        if isinstance(articles, dict):
            articles = [articles]

        for article in articles:
            out = {"PubmedArticleSet": {"PubmedArticle": article}}
            pmid = article["MedlineCitation"]["PMID"]["#text"]
            xml = xmltodict.unparse(out, pretty=True, full_document=False)

            # todo: write to /tmp/ instead?
            with open("xml/{}.xml".format(pmid), "w") as f:
                f.write(xml)

            medline_json = pp.parse_medline_xml("xml/{}.xml".format(pmid))

            with open("json/{}.json".format(pmid), "w") as f:
                f.write(json.dumps(medline_json, indent=2))

        shutil.rmtree("xml")
Code Example #8
def sents2redis(xml_in, redis_server):
    analyze_out = pp.parse_medline_xml(xml_in)
    bcnt = 0

    r = redis.StrictRedis(host='%s' % redis_server, port=6379, db=0)
    pipe = r.pipeline()

    print 'Medline2Redis', xml_in

    for paper in analyze_out:
        pmid = paper['pmid']

        title = paper['title'].encode('utf-8').replace('\n', ' ')
        title = u2a_convert(pmid, title, 'title')

        abstract = ''
        if paper['abstract'] is not None:
            abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
            abstract = u2a_convert(pmid, abstract, 'abstract')
        else:
            print 'Cannot find abstract for PMID %s' % pmid

        #affiliation: corresponding author's affiliation
        #authors: authors, each separated by ;
        #mesh_terms: list of MeSH terms, each separated by ;
        #keywords: list of keywords, each separated by ;
        #pubdate: Publication date. Defaults to year information only.
        year = paper['pubdate']
        author = paper['author']
        keywords = paper['keywords']
        mesh_terms = paper['mesh_terms']
        affiliation = paper['affiliation']
        journal = paper['journal']

        pipe.set('%s:title' % pmid, '%s' % title)
        pipe.set('%s:abstract' % pmid, '%s' % abstract)
        pipe.set('%s:pubtator' % pmid,
                 '%s|t|%s\n%s|a|%s' % (pmid, title, pmid, abstract))
        pipe.set('%s:pubdate' % pmid, year)
        pipe.set('%s:author' % pmid, author)
        pipe.set('%s:mesh_terms' % pmid, mesh_terms)
        pipe.set('%s:keywords' % pmid, keywords)
        pipe.set('%s:affiliation' % pmid, affiliation)
        pipe.set('%s:journal' % pmid, journal)

        txt_str = '%s %s' % (title, abstract)
        sents = get_sentences_by_geniass(pmid, txt_str)
        scnt = 0
        for sent in sents:
            pipe.set('%s:sentence:%d' % (pmid, scnt), sent)
            scnt += 1

        bcnt = bcnt + 1
        if bcnt % 100 == 0:
            print bcnt, 'cnv medline records inserted.'

    pipe.execute()
Code Example #9
def stuff(t):
    with open("xml/tmp.xml", "w") as f:
        f.write(t)

    medline_json = pp.parse_medline_xml("xml/tmp.xml")

    for article in medline_json:
        with open("json/{}.json".format(article["pmid"]), "w") as f:
            f.write(json.dumps(article, indent=2))
Code Example #10
    def read_and_index_articles_file(self, infile_):
        infile = str(infile_)
        print("Reading %s " % infile)
        if infile.endswith(".xml.gz"):
            f = gzip.open(infile, 'rb')
        elif infile.endswith(".xml"):
            f = open(infile, 'rb')
        else:
            print("Ignoring '%s': filename does not end with '.xml' or '.xml.gz'"
                  % infile)
            return
        articles = pp.parse_medline_xml(f)
        listattrs = [
            'authors', 'mesh_terms', 'publication_types', 'chemical_list',
            'keywords', 'references', 'affiliations'
        ]
        ids = set()
        deletedrecords, deletedpmids = list(), list()
        for i, ar in enumerate(articles):
            if ar['delete']:
                # DeleteCitation entries at the end of the xml archive files
                # are parsed to an object with field values set to float NaN
                deletedrecords.append(i)
                deletedpmids.append(ar['pmid'])
                continue
            try:
                num(ar, 'pmc')
            except ValueError:
                ar['pmc'] = 2000
            ar['_id'] = num(ar, 'pmid')
            ids.add(ar['_id'])
            try:
                ar['pubdate'] = datetime.datetime(int(ar['pubdate']), 1, 1)
            except ValueError:
                print(ar['pubdate'])
                ar['pubdate'] = datetime.datetime(2000, 1, 1)
            for listattr in listattrs:
                if len(ar[listattr]) == 0:
                    del ar[listattr]
                else:
                    spr = ';' if listattr in ['authors', 'references'] else '; '
                    ar[listattr] = ar[listattr].split(spr)
        for i in reversed(deletedrecords):
            del articles[i]
        self.qry.deletepubmedids(deletedpmids)
        if self.db == "Elasticsearch":
            if not self.qry.checkpubmedidsindexed(list(ids)):
                self.es_index(articles)
            else:
                print("Records in %s look like they have already been indexed, skipping"
                      % infile)
        else:  # assume MongoDB
            self.mdb_index(articles)
Code Example #11
def read_xml_to_dict(folder_to_xmls, 
                     all_xml_files = None,
                     keys_to_parse = None
                     ):
    '''
    Read xml data into dicts and store the desired values corresponding to
    the keys specified in the list keys_to_parse
    
    Input
    -----
    folder_to_xmls: pathlib.PosixPath object denoting the path to the folder 
        containing all the .xml files to be read
        
    all_xml_files: list of str, denoting the file names to be read in the 
        folder_to_xmls. Can be obtained from get_files_in_folder() 
        
    keys_to_parse: list of str, denoting the keys of the dictionary holding 
        all the xml data from each file. The dictionary is created from
        parse_medline_xml() part of the pubmed_parser package 
    
    Output
    ------
    all_values: list of len(keys_to_parse) of lists L
        all_values[i] contains a list L with all the values corresponding to
        key=keys_to_parse[i]. 
        len(L) depends on the data contained in the .xml 
        files that will be read.
    '''
    xml_file = [] #keep here the xml file name from which the data are read
    
    # Initialize a list with N empty lists with N=len(keys_to_parse)
    # This is where we store all the values from the keys_to_parse keys
    # for every dict
    all_values = [[]] * len(keys_to_parse)
    for current_xml in all_xml_files:
        print('\nIterating file...:', current_xml)
        # Normally parse_pubmed_xml() should work (since we get data from PubMed),
        # but this does not appear to be the case.
        # Instead parse_medline_xml() gets the desired info from the xml files.
        # To be further checked.
        dicts_out = pp.parse_medline_xml(str(folder_to_xmls/current_xml))
        for d in dicts_out:
            for i, key in enumerate(keys_to_parse):
                try:
                    if all_values[i]:
                        all_values[i].append(d[key])
                    else:
                        # Store the value of key in a list so it can be further 
                        # appended in the loop
                        all_values[i] = [d[key]]
                    xml_file.append(current_xml)# keep xml file name
                except KeyError:
                    print('\nKey ', key, ' not found!')
                      
    return all_values, xml_file    
Code Example #12
def medline2redis(xml_in, pmids, redis_server):
	analyze_out = pp.parse_medline_xml(xml_in)
	cnt=0
	bcnt = 0

	r = redis.StrictRedis(host='%s' % redis_server, port=6379, db=0)
	pipe = r.pipeline()

	print 'Medline2Redis', xml_in
	
	for paper in analyze_out:
		cnt = cnt + 1

		if cnt % 1000 == 0:
			print cnt, "medline records processed"

		pmid = paper['pmid']

		if paper['pmid'] not in pmids.keys():
			continue

		title = paper['title'].encode('utf-8').replace('\n', ' ')
		title = u2a_convert(pmid, title, "title")

		abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
		abstract = u2a_convert(pmid, abstract, "abstract")

		#affiliation: corresponding author's affiliation
		#authors: authors, each separated by ;
		#mesh_terms: list of MeSH terms, each separated by ;
		#keywords: list of keywords, each separated by ;
		#pubdate: Publication date. Defaults to year information only.
		year = paper['pubdate']
		author = paper['author']
		keywords = paper['keywords']
		mesh_terms = paper['mesh_terms']
		affiliation = paper['affiliation']
		journal = paper['journal']

		pipe.set('%s:title' % pmid, '%s' % title)
		pipe.set('%s:abstract' % pmid, '%s' % abstract)
		pipe.set('%s:pubtator' % pmid, '%s|t|%s\n%s|a|%s' % (pmid, title, pmid, abstract))
		pipe.set('%s:pubdate' % pmid, year)
		pipe.set('%s:author' % pmid, author)
		pipe.set('%s:mesh_terms' % pmid, mesh_terms)
		pipe.set('%s:keywords' % pmid, keywords)
		pipe.set('%s:affiliation' % pmid, affiliation)
		pipe.set('%s:journal' % pmid, journal)

		bcnt = bcnt + 1
		if bcnt % 100 == 0 :
			print bcnt, 'cnv medline records inserted.'

	pipe.execute()
Code Example #13
def parse_medline_xml(xml_file, output_file):
    """Import medline XML file into prophet database."""
    # For medline
    import pubmed_parser as pp
    dicts_out = pp.parse_medline_xml(xml_file,
                                     year_info_only=False,
                                     nlm_category=True,
                                     author_list=True,
                                     reference_list=True)

    with open(output_file, 'w') as fp:
        json.dump(dicts_out, fp, cls=DateEncoder)
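A hedged usage sketch for the wrapper above; the archive and output names are placeholders, and DateEncoder is assumed to be the source project's JSON encoder for date objects.

import json

# Hypothetical file names; any MEDLINE .xml.gz archive would do.
parse_medline_xml("pubmed22n0001.xml.gz", "pubmed22n0001.json")

# Reading the dump back yields the parsed records as plain JSON.
with open("pubmed22n0001.json") as fp:
    records = json.load(fp)
print(len(records), records[0]["pmid"])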
Code Example #14
def main(psql=True):
    input_file_path = "data/pubmed/pubmed20n0340.xml.gz"

    logging.info(f"Processing articles from {input_file_path}.")
    article_dicts = pp.parse_medline_xml(input_file_path,
                                         year_info_only=False,
                                         author_list=False,
                                         reference_list=True)
    logging.info(f"Loaded articles from {input_file_path}.")

    global_init()
    for ad in article_dicts:
        logging.info(f"Processing article {ad['pmid']}.")
        article = Article()
        article.id = "PMID:" + ad["pmid"]
        article.version = "v1"
        article.source = "PubMed"
        article.journal = ad["journal"]
        article.article_type = "postprint"
        article.title = ad["title"]
        pubdate = ad["pubdate"]
        pubdate_dashes = pubdate.count("-")
        if pubdate_dashes == 2:  # format of parsed pubdate is YYYY-MM-DD
            article.publication_date = date.fromisoformat(pubdate)
        elif pubdate_dashes == 1:  # format of parsed pubdate is YYYY-MM
            article.publication_date = date.fromisoformat(pubdate + "-01")
        else:  # format of parsed pubdate is YYYY
            article.publication_date = date.fromisoformat(pubdate + "-01-01")
        article.update_date = date.today()
        article.modified_date = datetime.now()
        article.link = "https://pubmed.ncbi.nlm.nih.gov/" + ad["pmid"]
        article.pmid = ad["pmid"]
        article.doi = ad["doi"]
        article.summary = ad["abstract"]
        article.full_text = ""
        article.authors = [
            x.strip() for x in ad["authors"].split(";") if x != ""
        ]
        article.affiliations = [ad["affiliations"]]
        article.language = ""
        article.keywords = [
            x.strip() for x in ad["keywords"].split(";") if x != ""
        ]
        article.references = [r["pmid"] for r in ad["references"]]
        article.tags = [
            x.strip()
            for k in ["mesh_terms", "publication_types", "chemical_list"]
            for x in ad[k].split(";") if x != ""
        ]
        if psql:
            with session_scope() as sess:  # TODO move to outer for loop
                sess.add(article)
Code Example #15
def write_articles(articles):
    for article in articles:
        out = {"PubmedArticleSet": {"PubmedArticle": article}}
        pmid = article["MedlineCitation"]["PMID"]["#text"]
        xml = xmltodict.unparse(out, pretty=True, full_document=False)

        # todo: write to /tmp/ instead?
        with open("xml/{}.xml".format(pmid), "w") as f:
            f.write(xml)

        medline_json = pp.parse_medline_xml("xml/{}.xml".format(pmid))

        with open("json/{}.json".format(pmid), "w") as f:
            f.write(json.dumps(medline_json, indent=2))
Code Example #16
def parse_results_map(key):
    """Parse MEDLINE XML file"""
    # Extract name of file from key
    key_name = key.name.encode('utf-8')
    data_file = os.path.basename(key_name)
    # Download file from S3 bucket
    key.get_contents_to_filename(data_file)
    # Parse file
    temp = [
        Row(file_name=os.path.basename(data_file), **publication_dict)
        for publication_dict in pp.parse_medline_xml(data_file)
    ]
    # Delete file from local directory
    subprocess.call(['rm', '-rf', data_file])
    return temp
Code Example #17
def test_parse_medline_xml():
    """
    Test parsing MEDLINE XML
    """
    expected_title = "Monitoring of bacteriological contamination and as"
    expected_abstract = "Two hundred and sixty nine beef, 230 sheep and 165"

    parsed_medline = pp.parse_medline_xml(
        os.path.join("data", "pubmed20n0014.xml.gz"))
    assert isinstance(parsed_medline, list)
    assert len(parsed_medline) == 30000, "Expect to have 30000 records"
    assert (len([p for p in parsed_medline if len(p["title"]) > 0]) == 30000), \
        "Expect every record to have a title"
    assert parsed_medline[0]["title"][0:50] == expected_title
    assert parsed_medline[0]["abstract"][0:50] == expected_abstract
    assert parsed_medline[0]["pmid"] == "399296"
Code Example #18
def _download_data(api_url: str):
    res = requests.get(api_url)
    if res.status_code != 200:
        raise requests.HTTPError(res.reason)

    with open("{}/medline.xml".format(_tmp_dir), "w") as f:
        f.write(res.text)

    medline_json_list = pp.parse_medline_xml("{}/medline.xml".format(_tmp_dir))

    # Map PMID to article
    new_data = {}
    for article in medline_json_list:
        new_data[article["pmid"]] = article

    return new_data
Code Example #19
def process_open_xml(proc_id, xml_files, output_dir):
    import pubmed_parser as pp

    def filter_mesh(string):
        return " ".join(
            map(lambda y: y[0],
                map(lambda x: x.split(";"),
                    string.split(":")[1:])))

    print("[Process-{}] Started".format(proc_id))
    articles = []
    for file_name in xml_files:
        print(proc_id, file_name)
        try:
            articles.extend(
                pp.parse_medline_xml(file_name,
                                     year_info_only=False,
                                     nlm_category=False))
        except etree.XMLSyntaxError:
            print("Error on File " + file_name)

        gc.collect()

    articles_filter = filter(
        lambda x: (x["abstract"] is not None and len(x["abstract"]) > 0
                   and x["pubdate"] != ""), articles)

    articles_mapped = list(
        map(
            lambda x: {
                "id": x["pmid"],
                "title": x["title"],
                "abstract": x["abstract"],
                "keywords": x["keywords"],
                "pubdate": x["pubdate"],
                "mesh_terms": filter_mesh(x["mesh_terms"]),
                "delete": x["delete"]
            }, articles_filter))

    file_name = output_dir + "/pubmed_2019_{0:03}.p".format(proc_id)
    print("[Process-{}]: Store {}".format(proc_id, file_name))

    with open(file_name, "wb") as f:
        pickle.dump(articles_mapped, f)

    del articles
    print("[Process-{}] Ended".format(proc_id))
Code Example #20
    def parseXMLToDF(self):
        """
        Read XML files and parse them into a dataframe.

        Returns:
            Dataframe containing parsed papers
        
        """
        medline_files_rdd = self.SPARK_SESSION.sparkContext.parallelize(
            glob(self.xmlPath + "/*.xml"), self.numSlices)

        parse_results_rdd = medline_files_rdd.\
            flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                               for publication_dict in pp.parse_medline_xml(x)])

        medline_df = parse_results_rdd.toDF()
        return medline_df
Code Example #21
    def merge(self):
        print('PubMed path:', self.pubmed_path)

        with open(self.output_filename, mode='w', newline='\n') as ofile:

            # PubMed
            for filename in glob.glob(os.path.join(self.pubmed_path,
                                                   '**/*.xml'),
                                      recursive=self.recursive):
                print('file:', filename)
                dicts_out = pmp.parse_medline_xml(filename)

                self.write_dicts(dicts_out, 'abstract', ofile, 'title',
                                 'pubmed_abstract')

            # PMC
            for filename in glob.glob(os.path.join(self.pubmed_path,
                                                   '**/*.nxml'),
                                      recursive=self.recursive):
                print('file:', filename)

                # OA abstract
                try:
                    dicts_out = [pmp.parse_pubmed_xml(filename)]
                    self.write_dicts(dicts_out, 'abstract', ofile,
                                     'full_title', 'pmc_oa_abstract')
                except:
                    pass

                # OA image caption
                try:
                    dicts_out = pmp.parse_pubmed_caption(filename)
                    self.write_dicts(dicts_out, 'fig_caption', ofile,
                                     'fig_label', 'pmc_oa_image-caption')
                except:
                    pass

                # OA Paragraph
                try:
                    dicts_out = pmp.parse_pubmed_paragraph(filename,
                                                           all_paragraph=True)
                    self.write_dicts(dicts_out, 'text', ofile, 'reference_ids',
                                     'pmc_oa_paragraph')
                except:
                    pass
Code Example #22
def sents2txt(xml_in, output_file):
    analyze_out = pp.parse_medline_xml(xml_in)
    bcnt = 0

    print 'sents2txt ', xml_in

    sents = []

    for paper in analyze_out:
        pmid = paper['pmid']

        if paper['title'] is None:
            print '%s: title empty!' % pmid
            continue

        title = paper['title'].encode('utf-8').replace('\n', ' ')
        title = u2a_convert(pmid, title, 'title')

        abstract = ''
        if paper['abstract'] is not None:
            abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
            abstract = u2a_convert(pmid, abstract, 'abstract')
        else:
            print 'Cannot find abstract for PMID %s' % pmid

        txt_str = '%s %s' % (title, abstract)
        sub_sents = get_sentences_by_geniass(pmid, txt_str)

        sents += sub_sents

        if len(sents) > 1000:
            f_output = open(output_file, 'a')
            for sent in sents:
                f_output.write('%s\n' % sent)
            f_output.close()
            sents = []

        bcnt = bcnt + 1
        if bcnt % 100 == 0:
            print bcnt, 'medline records processed.'

    print 'Sentences written to file %s' % output_file
Code Example #23
    def merge(self):
        print('PubMed path:', self.pubmed_path)

        with open(self.output_filename, mode='w', newline='\n') as ofile:
            for filename in glob.glob(self.pubmed_path + '/*.xml',
                                      recursive=self.recursive):
                print('file:', filename)
                dicts_out = pmp.parse_medline_xml(filename)
                for dict_out in dicts_out:
                    if not dict_out['abstract']:
                        continue
                    try:
                        for line in dict_out['abstract'].splitlines():
                            if len(line) < 30:
                                continue
                            ofile.write(line.strip() + " ")
                        ofile.write("\n\n")
                    except:
                        ofile.write("\n\n")
                        continue
Code Example #24
def get_Pubtator_from_medline_xml(xml_in, pmids, txt_out):
    analyze_out = pp.parse_medline_xml(xml_in)
    pubtator_out = open(txt_out, 'w')
    cnt = 0

    for paper in analyze_out:
        cnt = cnt + 1
        if cnt % 1000 == 0:
            print cnt, "medline records processed"
        if paper['pmid'] not in pmids.keys():
            continue

        pubtator_out.write("%s|t|%s" %
                           (paper['pmid'], paper['title'].encode('utf-8')))
        pubtator_out.write('\n')
        pubtator_out.write("%s|a|%s" %
                           (paper['pmid'], paper['abstract'].encode('utf-8')))
        pubtator_out.write('\n')

    pubtator_out.close()
Code Example #25
def parse_single_doc(f):
    """parse single documents in medline"""
    # set file path
    file_name = "pubmed19n{:04d}.xml.gz".format(f)
    file_name = "../MEDLINE/" + file_name

    # dicts_out is a list of dictionaries
    dicts_out = pp.parse_medline_xml(file_name,
                                     year_info_only=False,
                                     nlm_category=False,
                                     author_list=False,
                                     reference_list=False)

    # load abstracts that are non-empty
    texts = []
    for dict_ in dicts_out:
        abs_text = dict_['abstract']
        if len(abs_text) > 0:
            texts.append(abs_text.strip())

    return texts
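A hypothetical call to the function above, assuming the MEDLINE archives sit under ../MEDLINE/ as the hard-coded path implies:

abstracts = []
for f in range(1, 4):  # first three pubmed19n00XX.xml.gz archives
    abstracts.extend(parse_single_doc(f))
print(len(abstracts), "non-empty abstracts loaded")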
Code Example #26
def build_df_and_save_file_from_meline_xml(filename):
    #print(f"loading {filename}...")
    output_pickle_filename = filename+'.pickle.xz'
    try: # try loading the file, and make sure it has at least five rows & an abstract
        df = pd.read_pickle(output_pickle_filename)
        assert len(df.loc[5, 'abstract']) > 10
        df.iloc[5]
        print(f"ALREADY PROCESSED, SKIPPING\t{filename}...")
        return pd.DataFrame()  # returning None makes Spark crash
    except: # if we can't load the processed pickle file, generate it from the xml
        pubmed_dict = pp.parse_medline_xml(filename) # dictionary output
        print(f"loaded {filename}\tcontains {len(pubmed_dict)} entries.")
        tmp_df = pd.DataFrame()
        tmp_df['year'] = [d['pubdate'] for d in pubmed_dict]
        tmp_df['abstract'] = [d['abstract'] for d in pubmed_dict]
        tmp_df['abstract'] = tmp_df['abstract'].str.lower()
        tmp_df['abstract_nchar'] = [len(t) for t in tmp_df['abstract'] ]
        tmp_df = tmp_df[tmp_df.abstract_nchar > 100] # remove abstracts that are too short
        tmp_df.reset_index(inplace=True, drop=True)
        tmp_df.to_pickle(output_pickle_filename,compression='xz')
        return tmp_df
Code Example #27
def medline2txt(xml_in, pmids, job_size):
    analyze_out = pp.parse_medline_xml(xml_in)
    cnt = 0
    bcnt = 0

    print 'Medline2Txt', xml_in

    for paper in analyze_out:
        cnt = cnt + 1

        if cnt % 1000 == 0:
            print cnt, "medline records processed"

        pmid = paper['pmid']
        sub_dir = 'input/%d' % (int(pmid) % job_size)

        if paper['pmid'] not in pmids.keys():
            continue

        title = paper['title'].encode('utf-8').replace('\n', ' ')
        title = u2a_convert(pmid, title, "title")

        abstract = paper['abstract'].encode('utf-8').replace('\n', ' ')
        abstract = u2a_convert(pmid, abstract, "abstract")

        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        f_tmp_in_fn = '%s/%s.txt' % (sub_dir, pmid)
        f_tmp_in = open(f_tmp_in_fn, 'w')
        # text = '%s|t|%s\n%s|a|%s\n' % (pmid, title, pmid, abstract)  # PubTator format
        text = '%s %s' % (title, abstract)  # PWTEES FORMAT
        f_tmp_in.write(text)
        f_tmp_in.close()

        bcnt = bcnt + 1
        if bcnt % 100 == 0:
            print bcnt, 'cnv medline records inserted.'
Code Example #28
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove previously written parquet folders if they still exist
    for parquet_path in glob(os.path.join(save_dir, 'medline_*.parquet')):
        subprocess.call(['rm', '-rf', parquet_path])

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(
        os.path.join(download_dir, 'medline*.xml.gz')),
                              numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(
        save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             compression='gzip')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'), '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      compression='gzip')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(
        save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           compression='gzip')
Code Example #29
            tempStr[:-1])  #append new string with boundaries
        index += 1
    return synTable


#print matches in sample dictionary to console
def found(pubmed_dict, synTable):

    count = 1  #for testing to see how many matches
    for article in pubmed_dict:  #iterate through articles
        for index in range(len(synTable)):  #iterate through each search string
            find = synTable[index][3]  #string with regex word boundaries

            text = article['abstract']  #abstract to search
            #find regex string, ignore case sensitivity
            regex = re.findall(find, text, re.IGNORECASE)
            if regex:  #if found
                print(article['pmid'] + '\t' + synTable[index][0])
                print(regex)
                print(count)
                count += 1


synFile = "synonyms.txt"
pubmed_dict = pp.parse_medline_xml('pubmedsample18n0001.xml')

synTable = createSynTable(synFile)  #create synonym table
synTable = addRegexString(synTable)  #add new regex string to table

found(pubmed_dict, synTable)
Code Example #30
File: db_maker.py  Project: Anwesh2/FAME
import pubmed_parser as pp
import pandas as pd
import pymysql
pubmed_data = pp.parse_medline_xml("medsample1.xml")
db = pymysql.connect(host="127.0.0.1",
                     user="******",
                     passwd="dehradun123",
                     db="pubmed")
curr = db.cursor()
print(pubmed_data[5].keys())
print(pubmed_data[5])
try:
    curr.execute("""create table pubmed_article (
			pmid varchar(100) primary key,
			pmc varchar(100),
			issn_linking varchar(100),
			pubdate varchar(100),
			nlm_id varchar(50),
			title text,
			deleted varchar(50),
			abstract text,
			affiliation varchar(1000),
			journal varchar(1000),
			medline_ta varchar(100),
			country varchar(500),
			other_id varchar(200)
		);
		""")

except Exception as e:
    print("article", e)
Code Example #31
def parse_abstracts(x):
    arr = []
    for publication_dict in pp.parse_medline_xml(x):
        if publication_dict['abstract'] != "":
            arr.append(Row(abstract=publication_dict['abstract']))
    return arr
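Like code examples #1 and #20, `parse_abstracts` is written to be used as a Spark flatMap over a list of MEDLINE file paths; a minimal hedged sketch (folder and output names are placeholders):

import os
from glob import glob
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
paths = glob(os.path.join("medline", "*.xml.gz"))
abstract_df = spark.sparkContext.parallelize(paths) \
    .flatMap(parse_abstracts) \
    .toDF()
abstract_df.write.parquet("abstracts.parquet", mode="overwrite")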
Code Example #32
File: db_maker.py  Project: Anwesh2/FAME
import pubmed_parser as pp
import pandas as pd
import pymysql
pubmed_data=pp.parse_medline_xml("medsample1.xml")
db=pymysql.connect(host="127.0.0.1",user="******",passwd="dehradun123",db="pubmed")
curr=db.cursor()
print(pubmed_data[5].keys())
print(pubmed_data[5])
try:
	curr.execute("""create table pubmed_article (
			pmid varchar(100) primary key,
			pmc varchar(100),
			issn_linking varchar(100),
			pubdate varchar(100),
			nlm_id varchar(50),
			title text,
			deleted varchar(50),
			abstract text,
			affiliation varchar(1000),
			journal varchar(1000),
			medline_ta varchar(100),
			country varchar(500),
			other_id varchar(200)
		);
		""")

except Exception as e:
	print("article",e)

try:
	curr.execute("""create table authors (