Example #1
from Bio import Entrez, Medline

def getMedlineList(pmids):

    """
    This function takes a list of article-ids and returns a list of
    MedLine articles that contains an abstract.
    """

    records = []
    cleaned_records = []
    listLength = len(pmids)

    Entrez.email = '*****@*****.**'

    # Fetch the ids in batches of 650 per request to keep each
    # Entrez query at a manageable size.
    for i in range(0, listLength, 650):
        tempList = pmids[i:i + 650]
        handle = Entrez.efetch(db='pubmed', id=tempList, rettype='medline', retmode='text')
        try:
            records.extend(list(Medline.parse(handle)))
        except Exception:
            IOmodule.writeOutTxt(_mainFolder+'/'+'errordir_medline_records', pmids[i], '')

        print 'Downloaded',len(records),'MedLine articles.',str(listLength-len(records)),'remaining...'

    # Keep only the records that have an abstract ('AB' field).
    for article in records:
        if 'AB' in article:
            cleaned_records.append(article)
    
    print 'Returned',len(cleaned_records),'MedLine articles containing an abstract.'
    return cleaned_records
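
A hedged usage sketch (the two PMIDs below are arbitrary placeholders, not taken from the project, and Entrez.email must be set to a real address before fetching):

if __name__ == '__main__':
    samplePmids = ['17284678', '9997']    # hypothetical example ids
    for record in getMedlineList(samplePmids):
        print record['TI']    # MedLine title field
        print record['AB']    # MedLine abstract field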
Example #2
import math
import os

def gatherOfAllThings(startIndex=0, stopIndex=None):

    # Read in the range of diseases we want to get information about,
    # as a list; it needs to be sorted to support resume. If
    # stopIndex is None, _readDiseases returns the whole range.
    d = _readDiseases(startIndex, stopIndex)
    numberOfRareDiseases = len(d)
    # Default number of diseases per chunk, before writeout.
    numberToGet = 1
    # Calculate the number of steps for the chunked loop. Float
    # division is needed so math.ceil can round up; Python 2's '/'
    # truncates on ints.
    steps = int(math.ceil(numberOfRareDiseases / float(numberToGet)))

    # Default directory to save information files in.
    directory = 'medline_records'
    path_medlinerecords = _path_medlinerecords + _mainFolder + '/' + directory
    if not os.path.isdir(path_medlinerecords):
        os.mkdir(path_medlinerecords)

    for i in range(steps):
        # Read in a chunk of diseases as a list.
        diseaseList = d[i * numberToGet:i * numberToGet + numberToGet]

        diseaseDictionary = {}

        # Transfer the ordered disease list into an unordered
        # dictionary. Use 'j' here so the outer chunk index 'i' is
        # not shadowed.
        for j in range(len(diseaseList)):
            diseaseDictionary[diseaseList[j].keys()[0]] = diseaseList[j].values()[0]

        dictionary = {}
        # Runs through the disease dictionary and gets all the PMIDs
        # for each disease
        diseaseDictionary = getArticleIDs(diseaseDictionary)

        for disease in diseaseDictionary:

            dictionary[disease] = {}
            dictionary[disease]['records']=[]
            dictionary[disease]['description'] = diseaseDictionary[disease]['description']

            dictionary[disease]['records'].extend(getMedlineList(diseaseDictionary[disease]['PMIDs']))

            IOmodule.writeOutTxt(path_medlinerecords, disease, dictionary[disease], 'w')
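
The chunk arithmetic above is easier to see in isolation. A minimal, self-contained sketch (the item list and chunk size are made up for illustration):

import math

items = range(10)    # stand-in for the disease list
chunkSize = 4
steps = int(math.ceil(len(items) / float(chunkSize)))
for i in range(steps):
    print 'step', i, '->', items[i * chunkSize:(i + 1) * chunkSize]
# step 0 -> [0, 1, 2, 3]
# step 1 -> [4, 5, 6, 7]
# step 2 -> [8, 9]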
Example #3
import urllib2
from time import sleep, strftime
from BeautifulSoup import BeautifulSoup    # BeautifulSoup 3
from HTMLParser import HTMLParseError

def fetchPubmedDiseaseTerms(pages):

    """
    Takes a URL-list of pages to crawl for pubmed terms, uids and optional
    describtions.

    Returns a dictionary on the form:
    {DiseaseName:{db='',terms:'',syn=[],uid:'',desc:''}}
    """

    pubmedURLs={}

    printvar=0
    pagenumber=0
    desccounter=0
    for page in pages:
        pagenumber+=1
        
        # Open the page, retrying up to three times. Break out on the
        # first successful open.
        for i in range(3):
            try:
                c = urllib2.urlopen(page)
                break
            except Exception:
                print "Could not open %s" % page
                print "Attempt", str(i + 1), "out of 3"
                sleep(5)
                if i == 2:
                    print "Could not open page. Terminating.."
                    raise StopIteration()

        try:
            soup=BeautifulSoup(c.read())
        except HTMLParseError:
            print 'Experienced difficulties opening %s' % page
            IOmodule.writeOutTxt(_subFolder+"/"+diseaseFolder+'/'+errorFolder,strftime('%H%M%S'),page)
            continue

        # Get disease name.
        title=soup.html.head.title.string

        # Some pages are 'officially' not working. Catch them here.
        if title=='NIH Office of Rare Diseases Research (ORDR) - Error':
            IOmodule.writeOutTxt(_subFolder+"/"+diseaseFolder+'/'+errorFolder,'Page error'+strftime('%H%M%S'),page)
            print 'Page Error on %s' % page
            continue

        # Allocate dictionary.
        pubmedURLs[title]={}
        pubmedURLs[title]['db']='pubmed'    # ..database to search in (pubmed by default)
        pubmedURLs[title]['terms']=''       # ..handcrafted search term
        pubmedURLs[title]['syn']=[]         # ..disease synonyms
        pubmedURLs[title]['uid']=''         # ..search id
        pubmedURLs[title]['desc']=''        # ..optional disease description

        # Check for PubMed direct links.
        links=soup('a')
        found=False
        for link in links:
            if (link.contents):
                if (link.contents[0] == 'PubMed') and ('href' in dict(link.attrs)):
                    urlString = link['href'].lower()
                    # If there is a PubMed direct link and it's an id:
                    if ('uid=' in urlString) or ('uids=' in urlString) or ('idsfromresult=' in urlString):
                        tokens = _parseURL(urlString)
                        uid = tokens['from_uid']
                        pubmedURLs[title]['uid'] = uid
                        pubmedURLs[title]['db'] = tokens['db']
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title
                        continue
                    # If there is a PubMed direct link and it's a handcrafted term:
                    elif ('term=' in urlString):
                        tokens = _parseURL(urlString)
                        terms = tokens['term']
                        pubmedURLs[title]['terms'] = terms
                        pubmedURLs[title]['db'] = tokens['db']
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title
                        continue
                    # Special case 1: A PubMed direct link, but the uid is not part of the tokens.
                    elif ('/entrez/' not in urlString):
                        # The uid follows '/pubmed/' (8 characters).
                        start = urlString.find('/pubmed/') + 8
                        if '?' in urlString:
                            end = urlString.find('?')
                            uid = urlString[start:end]
                        else:
                            uid = urlString[start:]
                        print uid
                        pubmedURLs[title]['uid'] = uid
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title, '. (Special case 1: No tokens)'
                    # Special case 2: If there is a WebEnv token, the url is (by experience) not working, but the disease name is still valuable for a PubMed search.
                    elif '&webenv=' in urlString:
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title, '. (Special case 2: WebEnv)'

                # Terminate if an unexpected url shows up: a PubMed
                # link that matched none of the patterns above.
                if (not found) and (link.contents[0] == 'PubMed'):
                    print 'Could not fetch url'
                    raise StopIteration()

        # A simple addition to the printouts.
        if not found:
            printvar+=1
            print 'Found',str(printvar),'Diseases.',title,'(no uid or term).'

        # Notify if an unexpected database shows up.
        if pubmedURLs[title]['db'] not in ('', 'omim', 'pubmed'):
            print "*****Found different db:",pubmedURLs[title]['db']
            print 'Could not fetch url'
            raise StopIteration()

        # Disease synonyms are also added to the term list.
        lis=soup('li')
        for li in lis:
            if ('synonym' in str(li.parent)):
                synonym=li.contents[0]
                if ',' in str(synonym):
                    additionalSynonyms = synonym.split(',')
                    for syn in additionalSynonyms:
                        pubmedURLs[title]['syn'].append(syn)
                        print '  ' + syn
                else:
                    pubmedURLs[title]['syn'].append(synonym)
                    print '  ' + synonym

        # Look for an optional disease description on rarediseases.info.nih.gov.
        descs=soup('span')
        for desc in descs:
            if ('id' in dict(desc.attrs)):
                idString=desc['id'].lower()
                if ('descriptionquestion' in idString) and ('#003366' not in str(desc)):
                    desc=_cleanString(str(desc))
                    pubmedURLs[title]['desc']=desc
                    desccounter+=1
                    print '    *Found optional disease description'
        print ''
        
        if ((pagenumber%20)==0):
            # Print status report
            print '*****************************************************'
            print 'Total pages looked in:',str(pagenumber),'\nPages found:',str(printvar),'\nRemaining in total:',(len(pages)-printvar),'out of',len(pages),'\nDescriptions found:',str(desccounter)
            print '*****************************************************'
            print 'Writing to files...'
            # Write out and flush dictionary
            for disease in pubmedURLs:
                # Remove some problematic tokens from the file name
                content=pubmedURLs[disease]
                disease=removeSlashes.sub(' ',disease)
                disease=removeCommas.sub(' ',disease)
                # Write out
                IOmodule.writeOutTxt(diseaseFolder,disease,content)
            pubmedURLs={}
            print 'Wrote successfully. Dictionary flushed.'
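
The helper _parseURL is not shown in these examples. A minimal sketch of what such a helper might do, assuming the tokens of interest ('db', 'from_uid', 'term') live in the URL's query string; the name _parseURLSketch and the example URL are hypothetical:

import urlparse

def _parseURLSketch(urlString):
    # Split the query string into a {key: value} dict, keeping only
    # the first value found for each key.
    query = urlparse.urlparse(urlString).query
    return dict((k, v[0]) for k, v in urlparse.parse_qs(query).items())

# _parseURLSketch('http://www.ncbi.nlm.nih.gov/entrez?db=pubmed&from_uid=123')
# -> {'db': 'pubmed', 'from_uid': '123'}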