def getMedlineList(pmids):
    """
    Takes a list of article IDs and returns a list of MedLine records
    that contain an abstract.
    """
    records = []
    cleaned_records = []
    listLength = len(pmids)

    Entrez.email = '*****@*****.**'

    # Fetch in chunks of 650 IDs to keep each Entrez request within limits.
    for i in range(0, listLength, 650):
        tempList = pmids[i:i + 650]
        handle = Entrez.efetch(db='pubmed', id=tempList, rettype='medline', retmode='text')
        try:
            records.extend(list(Medline.parse(handle)))
        except Exception:
            IOmodule.writeOutTxt(_mainFolder + '/' + 'errordir_medline_records', pmids[i], '')
        print 'Downloaded', len(records), 'MedLine articles.', str(listLength - len(records)), 'remaining...'

    # Keep only the records that contain an abstract (the 'AB' field).
    for article in records:
        if 'AB' in article:
            cleaned_records.append(article)
    print 'Returned', len(cleaned_records), 'MedLine articles containing an abstract.'
    return cleaned_records
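# A minimal usage sketch for getMedlineList, assuming the Bio.Entrez and
# Bio.Medline imports at the top of this module; the PMIDs below are
# hypothetical placeholders, not real article IDs:
#
#   example_pmids = ['12345678', '23456789']
#   abstracts = getMedlineList(example_pmids)
#   for rec in abstracts:
#       print rec['TI']        # 'TI' holds the article title
#       print rec['AB'][:80]   # 'AB' holds the abstract text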
def gatherOfAllThings(startIndex=0, stopIndex=None):
    # Get the number of diseases, based on start and stop. If
    # stopIndex is None, the whole range is used.
    numberOfRareDiseases = len(_readDiseases(startIndex, stopIndex))
    # Default number of diseases per chunk, before write-out.
    numberToGet = 1
    # Calculate the number of steps for looping.
    steps = int(math.ceil(float(numberOfRareDiseases) / numberToGet))

    # Default directory to save information files in.
    directory = 'medline_records'
    path = _path_medlinerecords + _mainFolder + '/' + directory
    if not os.path.isdir(path):
        os.mkdir(path)

    # Read in the range of diseases to fetch information about.
    # The list needs to be sorted to support resume.
    d = _readDiseases(startIndex, stopIndex)
    for step in range(steps):
        # Read in a chunk of diseases.
        diseaseList = d[step * numberToGet:step * numberToGet + numberToGet]
        diseaseDictionary = {}
        for j in range(len(diseaseList)):
            # Transfer the ordered disease list into an unordered dictionary.
            diseaseDictionary[diseaseList[j].keys()[0]] = diseaseList[j].values()[0]
        dictionary = {}
        # Run through the disease dictionary and get all the PMIDs
        # for each disease.
        diseaseDictionary = getArticleIDs(diseaseDictionary)
        for disease in diseaseDictionary:
            dictionary[disease] = {}
            dictionary[disease]['records'] = []
            dictionary[disease]['description'] = diseaseDictionary[disease]['description']
            dictionary[disease]['records'].extend(getMedlineList(diseaseDictionary[disease]['PMIDs']))
            IOmodule.writeOutTxt(path, disease, dictionary[disease], 'w')
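# A hedged usage sketch for gatherOfAllThings (the index values below are
# made up): because _readDiseases returns a sorted list, an interrupted
# crawl can be resumed by restarting from the index of the first disease
# whose record file has not yet been written out:
#
#   gatherOfAllThings()                  # crawl the whole disease range
#   gatherOfAllThings(0, 10)             # small test run on ten diseases
#   gatherOfAllThings(startIndex=120)    # resume a crashed run at index 120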
def fetchPubmedDiseaseTerms(pages):
    """
    Takes a URL list of pages to crawl for PubMed terms, uids and optional
    descriptions.

    Returns a dictionary of the form:
    {DiseaseName: {db: '', terms: '', syn: [], uid: '', desc: ''}}
    """
    pubmedURLs = {}
    printvar = 0
    pagenumber = 0
    desccounter = 0

    for page in pages:
        pagenumber += 1

        # Open the page, retrying up to three times before giving up.
        for i in range(3):
            try:
                c = urllib2.urlopen(page)
                break
            except Exception:
                print "Could not open %s" % page
                print "Attempt", str(i + 1), "out of 3"
                sleep(5)
                if i == 2:
                    print "Could not open page. Terminating.."
                    raise StopIteration()
        try:
            soup = BeautifulSoup(c.read())
        except HTMLParseError:
            print 'Experienced difficulties opening %s' % page
            IOmodule.writeOutTxt(_subFolder + "/" + diseaseFolder + '/' + errorFolder, strftime('%H%M%S'), page)
            continue

        # Get the disease name.
        title = soup.html.head.title.string

        # Some pages are 'officially' not working. Catch them here.
        if title == 'NIH Office of Rare Diseases Research (ORDR) - Error':
            IOmodule.writeOutTxt(_subFolder + "/" + diseaseFolder + '/' + errorFolder, 'Page error' + strftime('%H%M%S'), page)
            print 'Page Error on %s' % page
            continue

        # Allocate the dictionary.
        pubmedURLs[title] = {}
        pubmedURLs[title]['db'] = 'pubmed'  # ..database to search in (pubmed by default)
        pubmedURLs[title]['terms'] = ''     # ..handcrafted search term
        pubmedURLs[title]['syn'] = []       # ..disease synonyms
        pubmedURLs[title]['uid'] = ''       # ..search id
        pubmedURLs[title]['desc'] = ''      # ..optional disease description

        # Check for PubMed direct links.
        links = soup('a')
        found = False
        for link in links:
            if link.contents:
                if (link.contents[0] == 'PubMed') and ('href' in dict(link.attrs)):
                    urlString = link['href'].lower()
                    # If there is a PubMed direct link and it's an id:
                    if ('uid=' in urlString) or ('uids=' in urlString) or ('idsfromresult=' in urlString):
                        tokens = _parseURL(urlString)
                        uid = tokens['from_uid']
                        pubmedURLs[title]['uid'] = uid
                        pubmedURLs[title]['db'] = tokens['db']
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title
                        continue
                    # If there is a PubMed direct link and it's a handcrafted term:
                    elif 'term=' in urlString:
                        tokens = _parseURL(urlString)
                        terms = tokens['term']
                        pubmedURLs[title]['terms'] = terms
                        pubmedURLs[title]['db'] = tokens['db']
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title
                        continue
                    # Special case 1: there is a PubMed direct link, but the
                    # uid is not part of the tokens.
                    elif '/entrez/' not in urlString:
                        start = urlString.find('/pubmed/') + 8
                        if '?' in urlString:
                            end = urlString.find('?')
                            uid = urlString[start:end]
                        else:
                            uid = urlString[start:]
                        print uid
                        pubmedURLs[title]['uid'] = uid
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title, '. (Special case 1: No tokens)'
                    # Special case 2: if there is a WebEnv, the URL is (by
                    # experience) not working, but the disease name is still
                    # valuable for a PubMed search.
                    elif '&webenv=' in urlString:
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title, '. (Special case 2: WebEnv)'
            # Terminate if an unexpected URL shows up.
            if link.contents:
                if (not found) and (link.contents[0] == 'PubMed'):
                    print 'Could not fetch url'
                    raise StopIteration()

        # A simple addition to the printouts.
        if not found:
            printvar += 1
            print 'Found', str(printvar), 'Diseases.', title, '(no uid or term).'

        # Notify if an unexpected database shows up.
        if ((pubmedURLs[title]['db'] != '') and (pubmedURLs[title]['db'] != 'omim') and (pubmedURLs[title]['db'] != 'pubmed')):
            print "*****Found different db:", pubmedURLs[title]['db']
            print 'Could not fetch url'
            raise StopIteration()

        # Disease synonyms are also added to the term list.
        lis = soup('li')
        for li in lis:
            if 'synonym' in str(li.parent):
                synonym = li.contents[0]
                if ',' in str(synonym):
                    additionalSynonyms = synonym.split(',')
                    for syn in additionalSynonyms:
                        pubmedURLs[title]['syn'].append(syn)
                        print ' ' + syn
                else:
                    pubmedURLs[title]['syn'].append(synonym)
                    print ' ' + synonym

        # Look for an optional disease description on rarediseases.info.nih.gov.
        descs = soup('span')
        for desc in descs:
            if 'id' in dict(desc.attrs):
                idString = desc['id'].lower()
                if ('descriptionquestion' in idString) and ('#003366' not in str(desc)):
                    desc = _cleanString(str(desc))
                    pubmedURLs[title]['desc'] = desc
                    desccounter += 1
                    print ' *Found optional disease description'
        print ''

        if (pagenumber % 20) == 0:
            # Print a status report.
            print '*****************************************************'
            print 'Total pages looked in:', str(pagenumber), '\nPages found:', str(printvar), '\nRemaining in total:', (len(pages) - printvar), 'out of', len(pages), '\nDescriptions found:', str(desccounter)
            print '*****************************************************'
            print 'Writing to files...'
            # Write out and flush the dictionary.
            for disease in pubmedURLs:
                # Remove some problematic tokens from the file name.
                content = pubmedURLs[disease]
                disease = removeSlashes.sub(' ', disease)
                disease = removeCommas.sub(' ', disease)
                # Write out.
                IOmodule.writeOutTxt(diseaseFolder, disease, content)
            pubmedURLs = {}
            print 'Wrote successfully. Dictionary flushed.'
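# _parseURL is called above but defined elsewhere in the module. As a point
# of reference, a minimal sketch of what it presumably does, assuming it
# merely splits a PubMed link's query string into a key/value dictionary
# (the real helper may well differ):
#
#   import urlparse
#
#   def _parseURL(urlString):
#       query = urlparse.urlparse(urlString).query
#       # parse_qs maps each key to a list of values; keep the first one.
#       return dict((k, v[0]) for k, v in urlparse.parse_qs(query).items())
#
# Under that reading, a link such as
# 'http://www.ncbi.nlm.nih.gov/sites/entrez?db=pubmed&term=pompe+disease'
# would yield {'db': 'pubmed', 'term': 'pompe disease'}.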