Example no. 1
def getMedlineList(pmids):

    """
    This function takes a list of article-ids and returns a list of
    MedLine articles that contains an abstract.
    """

    records = []
    cleaned_records = []
    listLength = len(pmids)

    Entrez.email = '*****@*****.**'

    for i in range(0, listLength, 650):
        tempList = pmids[i:i + 650]
        handle = Entrez.efetch(db='pubmed', id=tempList, rettype='medline', retmode='text')
        try:
            records.extend(list(Medline.parse(handle)))
        except:
            IOmodule.writeOutTxt(_mainFolder+'/'+'errordir_medline_records', pmids[i], '')

        print 'Downloaded',len(records),'MedLine articles.',str(listLength-len(records)),'remaining...'

    for article in records:
        if 'AB' in article:
            cleaned_records.append(article)
    
    print 'Returned',len(cleaned_records),'MedLine articles containing an abstract.'
    return cleaned_records
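
A minimal usage sketch for the function above, assuming Biopython's Entrez and Medline modules (used inside getMedlineList) are available; the PMIDs are placeholders:

from Bio import Entrez, Medline   # assumed imports used by getMedlineList

pmids = ['18457876', '11389160']            # placeholder PMIDs
records = getMedlineList(pmids)
for rec in records:
    print rec['PMID'], len(rec['AB'])       # every returned record has an abstract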
Example no. 2
def runTFIDF():

    """
    Create a normalized log-transformed TFIDF-matrix from a sparse coo_matrix.
    """

    files = IOmodule.getSortedFilelist(_matrixDir + "/")

    #    files = sorted([f for f in os.listdir(_matrixDir+"/") if os.path.isfile(_matrixDir+"/" + f)])

    for file in files:

        file = file[:-4]

        subM_coo = IOmodule.readInTDM(_matrixDir, file)

        t1 = time.time()
        dense, lil = _generateLogTFIDF(subM_coo)
        t2 = time.time()
        print "Generated log_TFIDF in " + str(t2 - t1)

        t1 = time.time()
        _normalizeVectorLengths(dense, lil, file)
        t2 = time.time()
        print "Normalized vector lengths in " + str(t2 - t1)

        print "Done with: " + file + "\n"
Example no. 3
def createVLHash(M_lil):

    VLHash={}
    for pmidHash in range(M_lil.shape[0]):
        VLHash[pmidHash]=linalg.norm((M_lil.getrow(pmidHash).data[0])[1:])

    IOmodule.pickleOut("/root/The_Hive/term_doc/hashTables", "VLHash","btd", VLHash)
Example no. 4
def createRLHash(M_lil,filename,save_file=True):

    """
    Precompute and save the norm of each row vector in the term-doc matrix.
    """

    t1=time.time()

    if not os.path.isdir(_hashTablesDir):
        os.mkdir(_hashTablesDir)

    RLHash={}
    count=0
    for pmidHash in range(1,M_lil.shape[0]):
        RLHash[pmidHash]=linalg.norm((M_lil.getrow(pmidHash).data[0])[1:])
        count+=1
        if save_file: print "Hashes created: "+str(count)

    t2=time.time()

    if save_file:
        IOmodule.pickleOut(_hashTablesDir, filename,"btd", RLHash)
        print "Created and saved RowLength-hash in: "+str(t2-t1)
    else:
        return RLHash
Example no. 5
def createDiseaseLabelHash():

    """
    Create and save a hash that connects every PMID with one or more diseases.
    """

    t1 = time.time()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')

    labelHash={}

    fileCount=0
    for file in files:
        subMatrix=IOmodule.readInTDM(_subMatrixDir, file)
        colMatrix=subMatrix.tocsc()

        pmids=colMatrix.getcol(0)[1:].data

        for pmid in pmids:
            try:
                labelHash[pmid].append(file[:-4])
            except KeyError:
                labelHash[pmid]=[file[:-4]]
            
        fileCount+=1
        print "Remaining:",(len(files)-fileCount),"Completed",file[:-4]

    t2 = time.time()

    print 'Created disease label hash in:',str(t2-t1)

    IOmodule.pickleOut(_hashTablesDir, _labelHash,"btd", labelHash)
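
As a side note on the construction above, the try/except pattern builds the same mapping that dict.setdefault would; a tiny self-contained sketch (the PMIDs and disease name are made up):

labelHash = {}
for pmid in [101, 102, 101]:                          # made-up PMIDs
    labelHash.setdefault(pmid, []).append('SomeDisease')
print labelHash   # {101: ['SomeDisease', 'SomeDisease'], 102: ['SomeDisease']}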
Example no. 6
def _normalizeVectorLengths(M_lil):

    """
    Normalize the length of a sparse lil_matrix.
    """

    t1=time.time()

    # Create a norm-hash of each row-vector in the stemmed term-doc matrix.
    vectorLength=SearchTermDoc.createRLHash(M_lil, _RLHash,False)

    for row in range(1,M_lil.shape[0]):

        norm=vectorLength[row]
        for col in (M_lil.getrow(row).nonzero()[1])[1:]:
            M_lil[row,col]=(M_lil[row,col])/norm
        print "Normalized:",row
    t2=time.time()
    print "Total:"+str(t2-t1)

    # This is madness
    tfidfMatrix = M_lil

    # Save and overwrite the log_tfidf generated above
    IOmodule.writeOutTDM(_termDocDir, _tfidfName+'_norm', tfidfMatrix)
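
A toy illustration of the normalization step above: dividing a row vector by its Euclidean norm (the value createRLHash precomputes) yields a unit-length vector. Only numpy is assumed here, not the project's matrices.

import numpy as np
from numpy import linalg

row = np.array([3.0, 4.0])
print row / linalg.norm(row)        # [ 0.6  0.8], i.e. length 1.0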
Example no. 7
def runOutlierDetector(dir, distance=cosine_dense, removePercent=0.05, output=False, time_log=False):

    files = IO.getSortedFilelist(dir+'/')

    if output:
        counter = 0

    for f in files:
        diseaseName = f[0:f.find('.mtx')]
        subTermDoc = IO.readInTDM(dir, diseaseName)
        if output:
            counter += 1
            print 'Count:', counter

        # If sub term document matrix is empty, just skip it.
        if subTermDoc.shape[0]==1 or subTermDoc.shape[1]==1:
            continue

        if time_log:
            t1 = time.time()
        subTermDoc = outlierDetector(subTermDoc, distance, removePercent, output, time_log)
        if time_log:
            print 'Time for outlier detection on', diseaseName, ':', str(time.time() -t1)[:4]

        if output:
            print 'Writing',

        subTermDoc = sparse.coo_matrix(subTermDoc)

        IO.writeOutTDM(_subFolder+'/'+outlierRemoved+str(int(removePercent*100)), diseaseName, subTermDoc)
Example no. 8
def createCLHash(M_coo,filename,save_file=True):

    """
    Precompute and save the length of each column vector in the term-doc matrix.
    Here the length refers to the number of elements.
    """

    t1=time.time()

    if not os.path.isdir(_hashTablesDir):
        os.mkdir(_hashTablesDir)

    M_lil=(M_coo.transpose()).tolil()

    CLHash={}
    count=0
    for termHash in range(1,M_coo.shape[1]):
        termVectorLength=len((M_lil.getrow(termHash).nonzero()[0])[1:])
        CLHash[termHash]=termVectorLength
        count+=1
        if save_file: print "Hashes created: "+str(count)+". Length:"+str(termVectorLength)

    t2=time.time()    

    if save_file:
        print "Created and saved ColumnLength-hash in: "+str(t2-t1)
        IOmodule.pickleOut(_hashTablesDir, filename,"btd", CLHash)
    else:
        return CLHash
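
A toy illustration of the "column length" computed above: it is the number of nonzero entries in a term column (excluding the index entry), i.e. the document frequency that later feeds the IDF. Only numpy/scipy are assumed and the values are made up.

import numpy as np
from scipy import sparse

M_coo = sparse.coo_matrix(np.array([[0, 1, 2],     # index row (hashes)
                                    [1, 3, 0],     # document 1
                                    [2, 1, 4]]))   # document 2
M_lil = (M_coo.transpose()).tolil()
for termHash in range(1, M_coo.shape[1]):
    print termHash, len((M_lil.getrow(termHash).nonzero()[0])[1:])
# term 1 occurs in 2 documents, term 2 in 1 document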
Example no. 9
def saveMatrix(filename,matrix):

    t1 = time.time()

    IOmodule.writeOutTDM('testFolder',filename, matrix)

    t2 = time.time()
    print 'Time used: ',(t2-t1)
Example no. 10
def _generateLogTFIDF(M_coo):

    """
    Creates a Term-Frequency Inverse-Document-Frequency from a sparse coo_matrix,
    using log-transformation on TF and IDF.

    Returns a sparse lil_matrix to be used for vector-normalization.
    """

    totalTime1=time.time()

    numberOfDocs = float(M_coo.shape[0]-1)

    print "Converting from coo to lil..."
    t1=time.time()
    tfidfMatrix=M_coo.tolil()
    t2=time.time()
    print "Matrix converted to lil in",(t2-t1)

    t1=time.time()

    for row in range(1, int(numberOfDocs)+1):
        
        for col in (tfidfMatrix.getrow(row).nonzero()[1])[1:]:
            
            tf=tfidfMatrix[row,col]

            #if tf == 0:
            #    print "Looked up zero-value at: "+str(docIndex)+" "+str(termVectorIndex)
            #    raise Exception
            if tf <=0:
                print tf
                tf=0.000000000000001 # <---for svd


            try:
                tf = math.log(1 + tf)
            except:
                print tf

            if _termSum[col]==0: continue
            idf = math.log(numberOfDocs / _termSum[col])
            
            tfidfMatrix[row,col]=tf*idf
        
        print "Row:",row
        
    t2=time.time()
    print "Total:"+str(t2-t1)

    # Save and overwrite the log_tfidf generated above
    IOmodule.writeOutTDM(_termDocDir, _tfidfName, tfidfMatrix)

    totalTime2=time.time()
    print "Total time: "+str(totalTime2-totalTime1)

    return tfidfMatrix
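
A toy illustration of the weighting applied in the loop above: each entry becomes log(1 + tf) * log(N / df), where N is the number of documents and df (the value from _termSum) is how many documents contain the term. The numbers below are made up.

import math

numberOfDocs = 1000.0     # N
tf = 3                    # raw term frequency in one document
df = 50                   # documents containing the term (_termSum[col])
print math.log(1 + tf) * math.log(numberOfDocs / df)    # ~4.15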
Example no. 11
def pmidDuplicateCounter(directory, number=None):

        # files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)])[:number]
        files = IOmodule.getSortedFilelist(directory+'/')
        
        pmidCount={}
        counter=0
        for f in files:
            # Open file descriptor
            fd = open(directory+'/'+f,'r')
            # Read in from file
            diseaseDic=eval(fd.read())
            # Close the file descriptor nicely again
            fd.close()
            
            medlineRecords=diseaseDic['records']

            for record in medlineRecords:
                pmidCount.setdefault(record['PMID'],0)
                pmidCount[record['PMID']]+=1

            counter+=1
            if counter % 1000 == 0:
                print counter
#            print "Files remaining:",(len(files)-counter)
                
        return pmidCount
Example no. 12
def _readDiseases(indexStart=0,indexStop=None):

    """
    Function for returning the content of all or some of the crawled diseases.

    By default all are returned in a dictionary of diseases on the form:
    [{DiseaseName:{db='',terms:'',syn=[],uid:'',desc:''}}]

    The reason all the dictionaries are returned as a list is to be sure of the
    order.
    """

    path=_subFolder+"/"+diseaseFolder+'/'

    #    files=sorted([f for f in os.listdir(path) if os.path.isfile(path+f)])
    files = IOmodule.getSortedFilelist(path, startIndex=indexStart, stopIndex=indexStop)

    sortedcontents=[]
    for f in files:
        contents={}
        diseaseName=f[0:f.find('.txt')]
        diseaseAttr=eval(open(path+f,'r').read())
        contents[diseaseName]=diseaseAttr
        sortedcontents.append(contents)

    return sortedcontents
Example no. 13
def countRecordfield(directory,field):

    """
    This function counts the number of identical fields in the MedLine records.

    This could, for instance, be used to see how many identical PMIDs
    have been downloaded across different disease searches.

    It takes a medline record directory (full path) and a field.

    It returns a dictionary of the form: {PMID: #ids,...}
    """
    
    fieldSum={}
#    files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)])
    files = IOmodule.getSortedFilelist(directory+'/')

    counter=0
    for f in files:

        diseaseDic=eval(open(directory+f,'r').read())

        medlineRecords=diseaseDic['records']

        for record in medlineRecords:
            item=record[field]
            fieldSum.setdefault(item,0)
            fieldSum[item]+=1

        counter+=1
        print "Files remaining:",(len(files)-counter)

    return fieldSum
Example no. 14
def gatherOfAllThings(startIndex=0,stopIndex=None):

    # Get the number of diseases, based on start and stop. If
    # stopIndex = None, then it returns the whole range
    numberOfRareDiseases = len(_readDiseases(startIndex,stopIndex))
    # Default number per chuck, before writeout
    numberToGet = 1
    # Calculate the numbers of steps, for looping.
    steps = int(math.ceil(float(numberOfRareDiseases) / numberToGet))

    # Default directory to save information files in.
    directory = 'medline_records'
    global _path_medlinerecords
    _path_medlinerecords+=_mainFolder+'/'+directory
    if not os.path.isdir(_path_medlinerecords):
        os.mkdir(_path_medlinerecords)

    # Read in the range of diseases we want to get information about,
    # in a list, it needs to be sorted to support resume.
    d=_readDiseases(startIndex,stopIndex)

    for i in range(steps):
        # Read in the a chuck of diseases in a list
        diseaseList = d[i * numberToGet:i * numberToGet + numberToGet]

        diseaseDictionary = {}

        for j in range(len(diseaseList)):
            # Transfer the ordered disease list into an unordered
            # dictionary
            diseaseDictionary[diseaseList[j].keys()[0]] = diseaseList[j].values()[0]

        dictionary = {}
        # Runs through the disease dictionary and gets all the PMIDs
        # for each disease
        diseaseDictionary = getArticleIDs(diseaseDictionary)

        for disease in diseaseDictionary:

            dictionary[disease] = {}
            dictionary[disease]['records']=[]
            dictionary[disease]['description'] = diseaseDictionary[disease]['description']

            dictionary[disease]['records'].extend(getMedlineList(diseaseDictionary[disease]['PMIDs']))

            IOmodule.writeOutTxt(_path_medlinerecords, disease, dictionary[disease], 'w')
Example no. 15
def _normalizeVectorLengths(M_dense, M_lil, filename):

    """
    Normalize the length of a sparse matrix, represented as a dense and a lil -
    format.
    """

    vectorLength = SearchTermDoc.createRLHash(M_lil, None, False)

    for row in range(1, M_lil.shape[0]):

        norm = vectorLength[row]
        for col in (M_lil.getrow(row).nonzero()[1])[1:]:
            M_dense[row, col] = (M_dense[row, col]) / norm

    tfidfMatrix = sparse.coo_matrix(M_dense)

    # Save the matrix
    IOmodule.writeOutTDM(_termDocDir, filename, tfidfMatrix)
Example no. 16
def loadMatrix(dirPath,filename):

    t1=time.time()

    M = IOmodule.readInTDM(dirPath, filename)

    t2=time.time()

    print str(t2-t1)

    return M
Example no. 17
def calculate_model_with_setted_parameter(t_input):
    s_address_file_model, s_address_file_drug, s_address_file_inputnode, s_address_folder_output, i_iteration, i_length, i_count_starting, i_resolution_drug, l_s_targets = t_input
    obj_iterator = Iterator_of_model_simulator(
        update_function,  #imported module
        s_address_file_model,
        s_address_file_drug,
        s_address_file_inputnode,
        i_iteration,
        i_length,
        i_count_starting,
        l_s_targets,
        i_resolution_drug)
    obj_iterator.calculate_model_for_each_drug_concentrations()

    s_file_drug = os.path.split(s_address_file_drug)[1]
    s_drug = os.path.splitext(s_file_drug)[0]
    s_address_folder_output = os.path.join(s_address_folder_output, s_drug)
    IOmodule.make_forder_for_output(s_address_folder_output)
    IOmodule.save_output_as_txtfile(obj_iterator, s_address_folder_output)
    return s_address_file_model
Example no. 18
def medlineDir2MatrixDir():

    """
    This function converts a directory of MedLine records to a new directory of
    corresponding term-doc matrices.

    The matrix dimensions (rows: m, cols: n) are determined from the records.

    It creates a directory (in the home folder) named 'diseaseMatrices' and
    stores the matrices as 'MatrixMarket' .mtx files, named by the disease name.
    """

    termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash)
    pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash)

    files = IOmodule.getSortedFilelist(_medlineDir+'/')

#    files = sorted([f for f in os.listdir(_medlineDir+"/") if os.path.isfile(_medlineDir+"/" + f)])

    counter = 0
    for file in files:
        data = _gatherMatrixData(file)

        # Get matrix dimensions (+1 for the [0,0] field)
        ## (Here follows a small 0.0001 sec. hack to get n = total number of terms)
        temp={}
        for pmid in data:
            for term in pmid[1]:
                temp[term[0]]=0
        m=len(data)+1
        n=len(temp)+1

        M = _populateMatrix(m, n, data,termHashTable, pmidHashTable)
        diseaseName = file[0:file.find('.txt')]
        IOmodule.writeOutTDM(_subMatrixDir, diseaseName, M)
        counter += 1
        print str(counter),"matrices made. Total number of terms:",len(M.getrow(0).nonzero()[0])
Example no. 19
def createDiseaseHash(dir,output=False):

    """
    Receives a directory containing files to be hashed. It uses the
    filename as a key. It requires the files to be in .mtx format. The
    hashes start from 1 ... number_of_files
    """

    diseaseHashes={}

    files = IO.getSortedFilelist(dir)
    counter=0
    for f in files:
        diseaseName=f[0:f.find('.mtx')]
        stdm=IO.readInTDM(dir,f)
        if stdm.shape[0]==1:
            continue
        if diseaseName not in diseaseHashes.keys():
            counter+=1
            if output:
                print 'Created', diseaseName, 'with hash', counter
            diseaseHashes[diseaseName]=counter

    IO.pickleOut(_hashTablesDir, diseaseHash,"btd", diseaseHashes)
Example no. 20
def createTermDoc(refreshHash=False):

    """
    This function creates a large term-doc matrix from a directory of sub term-
    doc matrices.

    It returns a matrix with dimensions given by the specified hash tables.

    It also saves the matrix for later use as a MatrixMarket .mtx file.
    """

    t1 = time.time()

    if refreshHash:
        createTermAndPmidHashes()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')

#    files = sorted([f for f in os.listdir(_subMatrixDir+"/") if os.path.isfile(_subMatrixDir+"/" + f)])
    
    termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash)
    pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash)


    # Need to add one due to non zero indexing
    m=len(pmidHashTable)+1
    n=len(termHashTable)+1

    termDoc = sparse.lil_matrix((m,n))

    # Insert values representing hashes
    for i in range(m): termDoc[i,0]=i
    termDoc[0,:]=range(n)

    for file in files:
        subMatrix=IOmodule.readInTDM(_subMatrixDir, file)
        subMCopy=subMatrix.todok()
        for i,j,v in zip(subMatrix.row, subMatrix.col, subMatrix.data):
            m = subMCopy[i,0]
            n = subMCopy[0,j]

            # Make sure not to add the index values
            if m==0 or n==0:
                continue

            termDoc[m,n] += v
        print "Added",file

    IOmodule.writeOutTDM(_termDocDir, _termDoc, termDoc)

    t2 = time.time()

    print 'Time elapsed:',str(t2-t1)

    return termDoc
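
A toy sketch of the indexing convention used above, where row 0 and column 0 of the lil_matrix hold the hash values and the remaining cells hold counts; only scipy is assumed and the numbers are made up.

from scipy import sparse

m, n = 3, 4
termDoc = sparse.lil_matrix((m, n))
for i in range(m): termDoc[i, 0] = i      # column 0: pmid hashes
termDoc[0, :] = range(n)                  # row 0: term hashes
termDoc[1, 2] += 5                        # term 2 occurs 5 times in document 1
print termDoc.todense()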
Example no. 21
def getSemanticKeywords(top=20):

    diseaseList=[("Fibrodysplasia ossificans progressiva","Boy, normal birth, deformity of both big toes (missing joint), quick development of bone tumor near spine and osteogenesis at biopsy"),
                ("Adrenoleukodystrophy  autosomal  neonatal form","Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions"),
                ("Papillon Lefevre syndrome","Boy age 14, yellow keratotic plaques on the skin of palms and soles going up onto the dorsal side. Both hands and feet are affected. swollen vulnerable gums, loss of permanent teeth"),
                ("Kleine Levin Syndrome","Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"),
                ("Schinzel Giedion syndrome","Male child, malformations at birth, midfacial retraction with a deep groove under the eyes, and hypertelorism, short nose with a low nasal bridge and large low-set ears, wide mouth and retrognathia. Hypertrichosis with bright reddish hair and a median frontal cutaneous angioma, short neck with redundant skin, Bilateral inguinal hernias, hypospadias with a megameatus, and cryptorchidism")]

    matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_90"
    #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_10"
    #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_tfidf_stemmed_reduced90_outlierRemoved5"

    scoreDic={}
    totalTermDic={}
    for disease in diseaseList:

        filename=disease[0]
        symptoms=disease[1]

        symptoms=SearchTermDoc._modifySearchString(symptoms)
        symptoms=map(FilterInterface.porterStemmer,symptoms)

        M_coo=IOmodule.readInTDM(matrixDir,filename)
        totalTermDic[filename]=(M_coo.shape[1]-1)

        M_csc=M_coo.tocsc()
    
        termSum=[]
        for col in range(1,M_coo.shape[1]):
            term=revTermHashTable[M_csc[0,col]]
            termSum.append((sum(M_csc.getcol(col).data[:-1]),term))

        termSum.sort()
        termSum.reverse()

        scoreDic[filename]={}
        for item in termSum:
            if item[1] in symptoms:
                scoreDic[filename][item[1]]=termSum.index(item)

    #return termSum[:top]

    for score in scoreDic.items():
        print "Total number of terms for the disease:",totalTermDic[score[0]]
        print str(score[0])+'\t'+str(score[1].items())
        print ''
Example no. 22
def countFields(directory, fields):

        # files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)])

        files = IOmodule.getSortedFilelist(directory+'/')
        
        fieldSum={}
        counter=0
        pmidCounter=0
        emptyCounter=0
        noDescription=0

        for f in files:
            
            fd = open(directory+f,'r')
            
            diseaseDic=eval(fd.read())

            fd.close()
            
            medlineRecords=diseaseDic['records']

            descriptionField=diseaseDic['description']

            if descriptionField== '':
                noDescription+=1

            if medlineRecords == []:
                print "Found empty record"
                emptyCounter+=1

            for record in medlineRecords:
                pmidCounter+=1
                for label in fields:
                    fieldSum.setdefault(label,0)
                    if label in record:
                        fieldSum[label]+=1

            counter+=1
            print "Files remaining:",(len(files)-counter)
                
        return fieldSum,{'pmid count': pmidCounter},{'empty diseases': emptyCounter}
Example no. 23
def main():
    """make models by txt files in model list folder, and get drug information from drug_probability.txt
    input the number of iteration(calculation of one trajectory) and the length of trajectory.
    input the resolution of drug concentration change (if 10 is inputed, 0,0.1,0.2,,, 0.9,1 cases are calculated)
    finally, input the node names to observe.
    then this function will make text files of observed node pattern for each model in output folder"""

    s_address_NSC = os.path.dirname(os.path.realpath(__file__))
    l_parameters = IOmodule.set_config(s_address_NSC)

    p = Pool(l_parameters[6])
    l_list_mutation_files = [
        os.path.join(l_parameters[0], s_filename_mutation)
        for s_filename_mutation in os.listdir(l_parameters[0])
    ]
    l_list_drug_files = [
        os.path.join(l_parameters[3], s_drug_file)
        for s_drug_file in os.listdir(l_parameters[3])
    ]

    s_address_file_inputnode = os.path.join(s_address_NSC, "input_nodes.txt")
    l_list_inputs = [
        (s_address_file_model, s_address_drug_file, s_address_file_inputnode,
         l_parameters[2], l_parameters[7], l_parameters[8], l_parameters[9],
         l_parameters[10], l_parameters[11])
        for s_address_file_model in l_list_mutation_files
        for s_address_drug_file in l_list_drug_files
    ]

    #print(s_address_folder_models, s_address_file_drug, i_iteration, type(i_iteration), l_s_targets)

    l_list_for_check = p.map(calculate_model_with_setted_parameter,
                             l_list_inputs)
    if l_list_mutation_files != l_list_for_check:
        for s_mutation_file in l_list_mutation_files:
            if s_mutation_file not in l_list_for_check:
                print(s_mutation_file +
                      " is not calculated! by multiprocessing error!")
Example no. 24
def main(fname, output=None):
    print("Input name", fname)
    print("Output name", output)

    tellpath = "/home/jneal/Phd/Codes/Phd-codes/WavelengthCalibration/testfiles/"  # Updated for git repo
   
    #ext = 0   # for testing
    #hdr, data = Get_DRACS(fname, ext)  #Get DRACS only finds relevant file in the path folder
    hdr = fits.getheader(fname)
    print("Observation time ", hrd[])
    data = fits.getdata(fname)
    
    uncalib_combined = data["Combined"]
    #uncalib_noda = uncalib_data["Nod A"]
    #uncalib_nodb = uncalib_data["Nod B"]
    uncalib_data = [range(1,len(uncalib_combined)+1), uncalib_combined]
    # get hdr information and then find corresponding Telluric spectra
    calib_data = IOmodule.read_2col(tellpath + "Telluric_spectra_CRIRES_Chip-" + 
                                    str(1) + ".txt")
    
    gf.print_fit_instructions()

    rough_a, rough_b = gf.get_rough_peaks(uncalib_data[0], uncalib_data[1], calib_data[0], calib_data[1])
    rough_x_a = [coord[0] for coord in rough_a]
    rough_x_b = [coord[0] for coord in rough_b]
    good_a, good_b = gf.adv_wavelength_fitting(uncalib_data[0], uncalib_data[1], 
                                       rough_x_a, calib_data[0], calib_data[1],
                                       rough_x_b)

    wl_map = gf.wavelength_mapping(good_a, good_b)

    calibrated_wl = np.polyval(wl_map, uncalib_data[0])
    
    plt.plot(calibrated_wl, uncalib_data[1], label="Calibrated spectra")
    plt.plot(calib_data[0], calib_data[1], label="Telluric spectra")
    plt.title("Calibration Output")
    plt.show()
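
A toy illustration of the final mapping step above: np.polyval applies the fitted polynomial coefficients (highest order first) to every pixel position. The coefficients below are made up, not a real fit.

import numpy as np

wl_map = [0.5, 2000.0]                  # assumed linear fit: wl = 0.5 * pixel + 2000
pixels = np.arange(1, 6)
print(np.polyval(wl_map, pixels))       # [ 2000.5  2001.   2001.5  2002.   2002.5]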
Example no. 25
def runAndSaveMatrices():

    """
    Transform a directory of matrices to a directory of decomposed matrices.
    """

    files = IOmodule.getSortedFilelist(_oldMatrixDir+'/')

    for file in files:

        M_coo=IOmodule.readInTDM(_oldMatrixDir,file)

        # Make sure the matrix contains information (1-dim. is an empty matrix)
        if M_coo.shape[0]==1:
            continue

        print "Shape:"+str(M_coo.shape)

        # SVD does not run well on single-dimension matrices
        ## (remembering that the first dimension is indices and does not count)
        if M_coo.shape[0]>2:
            M_dense=M_coo.todense()

            # Run SVD
            U,Sig,Vt=_svd(M_dense)

            # Get the reduced semantic space
            S= _semanticSpace(U,Sig,Vt,_reduceBy)

            # Recombine the indices and the reduced matrix
            M_dense[1:,1:]=S.todense()

            # Save the matrix
            M_coo=sparse.coo_matrix(M_dense)

            IOmodule.writeOutTDM(_newMatrixDir, file, M_coo)
            print ''
        else:
            print "Dimensionality too low for svd"
            IOmodule.writeOutTDM(_newMatrixDir, file, M_coo)
            print ''
Example no. 26
# Main folder
_path = os.getenv("HOME") + "/" + "The_Hive"
# Sub folder
_subFolder = _path + "/" + "term_doc"
# Hashtable directory
_hashTablePath = _subFolder + "/" + "hashTables"
# Set True for Porter-stemming
_stemmer = True
# For term document set to True, for DiseaseMatrix set to false.
_termdocumentmatrix = False
# Cosine measure = True, Sum measure = False
_cosineMeasure = False
############
if _termdocumentmatrix:
    # Disease label hash (for pmid lookup)
    _labelHash = IOmodule.pickleIn(_hashTablePath, "labelHash")
    print "Label hash loaded"
else:
    # Disease label hash (for label lookup)
    _diseaseHash = IOmodule.pickleIn(_hashTablePath, "diseaseHash") #_reduced")
    _labelHash = dict(zip(_diseaseHash.values(), _diseaseHash.keys()))
    print "Disease hash loaded"
############

def search(M_lil, M_csc, queryString, top=20):

    """
    This function is still a work in progress..
    """
    
    sanitizer = TextCleaner.sanitizeString()
Example no. 27
# Term- and PMID-hash directory
_hashTablesDir=_path+'/'+'term_doc'+'/'+"hashTables"

# If subFolder does not exist
if not os.path.isdir(_path+'/'+subFolder):
    os.mkdir(_path+'/'+subFolder)

_stemmer=True

####################################################################
#### Use stopword-removed TermDoc ##################################
####################################################################

if not _stemmer:
    # Hashes
    termHashTable=IOmodule.pickleIn(_hashTablesDir, "termHash")
    pmidHashTable=IOmodule.pickleIn(_hashTablesDir, "pmidHash")
    revPmidHashTable=dict(zip(pmidHashTable.values(),pmidHashTable.keys()))

####################################################################
#### Use stopword-removed and Porter-stemmed (english) TermDoc: ####
####################################################################
else:
    # Stemmed hashes
    termHashTable=IOmodule.pickleIn(_hashTablesDir, "termHash_stemmed")
    pmidHashTable=IOmodule.pickleIn(_hashTablesDir, "pmidHash_stemmed")
    revPmidHashTable=dict(zip(pmidHashTable.values(),pmidHashTable.keys()))


print "Hashes loaded"
Example no. 28
 # Load the precomputed length of each column in the stemmed term-doc matrix
#_termSum = IOmodule.pickleIn(_hashTablePath,_CLHash)


#######################################################################################
#### Use stopword-removed, Porter-stemmed (english) and TFIDF-prefiltered TermDoc: ####
#######################################################################################

# TFIDF-matrix file name
_tfidfName = "label_TFIDFMatrix_reduced_90"
# Vector-norm hash for the TFIDFMatrix
#_RLHash = "RLHash_tfidf_stemmed"
# Hash for the number of documents each term occurs in
_CLHash = "svdlabel_CLHash"
# Load the precomputed length of each column in the stemmed term-doc matrix
_termSum = IOmodule.pickleIn(_hashTablePath,_CLHash)

####################################################################

print "Hashes loaded."


def _generateLogTFIDF(M_coo):

    """
    Creates a Term-Frequency Inverse-Document-Frequency from a sparse coo_matrix,
    using log-transformation on TF and IDF.

    Returns a sparse lil_matrix to be used for vector-normalization.
    """
if __name__ == "__main__":

    # Do the test case HD30501-1 Chip-1
    path = "/home/jneal/Phd/Codes/Phd-codes/WavelengthCalibration/testfiles/"  # Updated for git repo
    #global param_nums
    #param_nums = 3 
    Dracspath = "/home/jneal/Phd/data/Crires/BDs-DRACS/"
    obj = "HD30501-1"
    objpath = Dracspath + obj + "/"
    chip = 0   # for testing
    hdr, DracsUncalibdata = Get_DRACS(objpath,0)
    UnCalibdata_comb = DracsUncalibdata["Combined"]
    #UnCalibdata_noda = DracsUncalibdata["Nod A"]
    #UnCalibdata_nodb = DracsUncalibdata["Nod B"]
    UnCalibdata = [range(1,1025), UnCalibdata_comb]
    Calibdata = IOmodule.read_2col(path + "Telluric_spectra_CRIRES_Chip-" + 
                                    str( 1) + ".txt")
    #print(Calibdata)
    #get_rough_peaks()
    Test_pxl_pos = [58.4375, 189.0625, 583.0, 688.1875, 715.6875, 741.8125, 
                    971.4375]
    Test_wl_pos = [2112.4897175403221, 2114.0333233870965, 2118.5533179435483,
                   2119.748622983871, 2120.0573441532256, 2120.358149395161, 
                   2122.9624895161287]

    CoordsA = Test_pxl_pos  # first one for now
    CoordsB = Test_wl_pos

    print_fit_instructions()
    #Comment Below line to skip this and work on next part
    good_coords_a, good_coords_b = adv_wavelength_fitting(UnCalibdata[0], UnCalibdata[1], 
                                       CoordsA, Calibdata[0], Calibdata[1],
                                       CoordsB)
Example no. 30
def createTermAndPmidHashes():

    """
    This function creates two hash tables of the PMID's and terms to be used
    for the term-doc matrix.

    Note that the terms are sanitized for any non-alphanumerical characters,
    and stop words are removed by default.
    """

    medlineDir = _medlineDir
    hashTables = _hashTablesDir
    termHashTable={}
    pmidHashTable={}
    termCounter = 0
    pmidCounter = 0

    files = IOmodule.getSortedFilelist(medlineDir+'/')
#    files = sorted([f for f in os.listdir(medlineDir+"/") if os.path.isfile(medlineDir+"/"+f)])

    # Get the regex pattern that sanitizeses strings.
    sanitizer = TextCleaner.sanitizeString()

    for file in files:
        records = RecordHandler.loadMedlineRecords(medlineDir, file)

        # *Note*
        # Parts of the following loops could be optimized by using dictionaries
        # for direct lookups instead of linear lookups, but since it's not
        # important, optimization will have to wait for another day.

        # Hash PMID's
        for diseaseRecords in records.values():
            for record in diseaseRecords:
                pmid=record[0]
                if pmid not in pmidHashTable:
                    pmidCounter+=1
                    pmidHashTable[pmid]=pmidCounter

                information=''
                # Get the abstract
                try:
                    information=' '+record[1]['AB']
                except:
                    print 'Unable to get abstract', record[0]
                try:
                    information+=' '+record[1]['TI']
                except:
                    print 'Unable to get title for', record[0]

                if 'MH' in record[1]:
                    for meshterm in record[1]['MH']:
                        information+=' '+meshterm
                # We do not want to print this, as most of the
                # records do not have MeSH.
                # print 'Unable to get MeSH terms for', record[0]

                # Sanitize the information
                information=sanitizer.sub(' ', information)
                # remove stopwords from the abstract
                information=FilterInterface.stopwordRemover(information)

                # OPTIONAL:
                # Stem the abstract
                if _stemmer: information=FilterInterface.porterStemmer(information)

                termList = [word for word in information.split(' ') if word != '']
                for term in termList:
                    if term not in termHashTable:
                        termCounter+=1
                        termHashTable[term]=termCounter
                    else: continue
                
        print str(termCounter)+" terms hashed. "+str(pmidCounter)+" pmids hashed."

    IOmodule.pickleOut(hashTables, _termHash,"btd", termHashTable)
    IOmodule.pickleOut(hashTables, _pmidHash,"btd", pmidHashTable)

    return termHashTable, pmidHashTable
Example no. 31
def fetchPubmedDiseaseTerms(pages):

    """
    Takes a URL-list of pages to crawl for pubmed terms, uids and optional
    descriptions.

    Returns a dictionary of the form:
    {DiseaseName:{db='',terms:'',syn=[],uid:'',desc:''}}
    """

    pubmedURLs={}

    printvar=0
    pagenumber=0
    desccounter=0
    for page in pages:
        pagenumber+=1
        
        # Open the page.
        for i in range(3):
            try:
                c=urllib2.urlopen(page)
                break
            except:
                print "Could not open %s" % page
                print "Attempt",str(i+1),"out of 3"
                sleep(5)
                if i==2:
                    print "Could not open page. Terminating.."
                    raise StopIteration()

        try:
            soup=BeautifulSoup(c.read())
        except HTMLParseError:
            print 'Experienced difficulties opening %s' % page
            IOmodule.writeOutTxt(_subFolder+"/"+diseaseFolder+'/'+errorFolder,strftime('%H%M%S'),page)
            continue

        # Get disease name.
        title=soup.html.head.title.string

        # Some pages are 'officially' not working. Catch them here.
        if title=='NIH Office of Rare Diseases Research (ORDR) - Error':
            IOmodule.writeOutTxt(_subFolder+"/"+diseaseFolder+'/'+errorFolder,'Page error'+strftime('%H%M%S'),page)
            print 'Page Error on %s' % page
            continue

        # Allocate dictionary.
        pubmedURLs[title]={}
        pubmedURLs[title]['db']='pubmed'    # ..database to search in (pubmed by default)
        pubmedURLs[title]['terms']=''       # ..handcrafted search term
        pubmedURLs[title]['syn']=[]         # ..disease synonyms
        pubmedURLs[title]['uid']=''         # ..search id
        pubmedURLs[title]['desc']=''        # ..optional disease description

        # Check for PubMed direct links.
        links=soup('a')
        found=False
        for link in links:
            if (link.contents):
                if ((link.contents[0] == 'PubMed')) & ('href' in dict(link.attrs)):
                    urlString = link['href'].lower()
                    # If there is a PubMed direct link and it's an id:
                    if ('uid=' in urlString) | ('uids=' in urlString) | ('idsfromresult=' in urlString):
                        tokens = _parseURL(urlString)
                        uid = tokens['from_uid']
                        pubmedURLs[title]['uid'] = uid
                        pubmedURLs[title]['db'] = tokens['db']
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title
                        continue
                    # If there is a PubMed direct link and it's a handcrafted term:
                    elif ('term=' in urlString):
                        tokens = _parseURL(urlString)
                        terms = tokens['term']
                        pubmedURLs[title]['terms'] = terms
                        pubmedURLs[title]['db'] = tokens['db']
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title
                        continue
                    # Special case 1: If there is a PubMed direct link but the uid is not part of the tokens.
                    elif ('/entrez/' not in urlString):
                        start = urlString.find('/pubmed/') + 8
                        if '?' in urlString:
                            end = urlString.find('?')
                            uid = urlString[start:end]
                        else:
                            uid = urlString[start:]
                        print uid
                        pubmedURLs[title]['uid'] = uid
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title, '. (Special case 1: No tokens)'
                    # Special case 2: If there is a webenv, the url is (by experience) not working but the disease name is still valuable for a pubmed search.
                    elif '&webenv=' in urlString:
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title, '. (Special case 2: WebEnv)'

                # Terminate if an unexpected url shows up
                if link.contents:
                    if (not found) & (link.contents[0]=='PubMed'):
                        print 'Could not fetch url'
                        raise StopIteration()

        # A simple addition to the printouts.
        if not found:
            printvar+=1
            print 'Found',str(printvar),'Diseases.',title,'(no uid or term).'

        # Notify if an unexpected database shows up.
        if ((pubmedURLs[title]['db']!='')&(pubmedURLs[title]['db']!='omim')&(pubmedURLs[title]['db']!='pubmed')):
            print "*****Found different db:",pubmedURLs[title]['db']
            print 'Could not fetch url'
            raise StopIteration()

        # Disease synonyms are also added to the term list.
        lis=soup('li')
        for li in lis:
            if ('synonym' in str(li.parent)):
                synonym=li.contents[0]
                if (',') in str(synonym):
                    aditionalSynonyms=synonym.split(',')
                    for syn in aditionalSynonyms:
                        pubmedURLs[title]['syn'].append(syn)
                        print '  ' + syn
                else:
                    pubmedURLs[title]['syn'].append(synonym)
                    print '  ' + synonym

        # Look for an optional disease description on rarediseases.info.nih.gov.
        descs=soup('span')
        for desc in descs:
            if ('id' in dict(desc.attrs)):
                idString=desc['id'].lower()
                if (('descriptionquestion' in idString) & ('#003366' not in str(desc))):
                    desc=_cleanString(str(desc))
                    pubmedURLs[title]['desc']=desc
                    desccounter+=1
                    print '    *Found optional disease description'
        print ''
        
        if ((pagenumber%20)==0):
            # Print status report
            print '*****************************************************'
            print 'Total pages looked in:',str(pagenumber),'\nPages found:',str(printvar),'\nRemaining in total:',(len(pages)-printvar),'out of',len(pages),'\nDescriptions found:',str(desccounter)
            print '*****************************************************'
            print 'Writing to files...'
            # Write out and flush dictionary
            for disease in pubmedURLs:
                # Remove some problematic tokens from the file name
                content=pubmedURLs[disease]
                disease=removeSlashes.sub(' ',disease)
                disease=removeCommas.sub(' ',disease)
                # Write out
                IOmodule.writeOutTxt(diseaseFolder,disease,content)
            pubmedURLs={}
            print 'Wrote successfully. Dictionary flushed.'
Example no. 32
def constructDiseaseMatrix(subMatrixDir, avg=False, output=False, time_log=False):

    """
    Receives a subMatrixDir, goes through all the files and sums up the
    columns of each, creating a single row vector containing the sum of
    all columns in the sub term-doc matrix. It then proceeds to make
    a disease term-doc matrix based on these row vectors.

    Optional flags are:

    avg, takes the average over the columns of the sub matrices
    instead of the sum.

    output, makes the function produce additional output.

    time_log, makes the function print out how much time is spent on
    what.
    """

    if output:
        print 'Initialising...'

    if time_log:
        t1 = time.time()

    files = IO.getSortedFilelist(subMatrixDir)

    termHashTable = IO.pickleIn(_hashTablesDir, _termHash)
    diseaseHashTable = IO.pickleIn(_hashTablesDir, diseaseHash)

    diseaseMatrix=sparse.lil_matrix((len(files)+1,len(termHashTable)+1))

    # Initialize subTermSum to something
    subTermSum = sparse.lil_matrix((1,1))

    if output:
        print 'Done initialising disease matrix of size', str((len(files)+1,len(termHashTable)+1))
        count = 0

    if time_log:
        print 'Time for initialization:', str(time.time() - t1)[:4]

    for f in files:
        if time_log:
            t2 = time.time()
        diseaseName = f[0:f.find('.mtx')]
        if output:
            print 'Processing', diseaseName
            count+=1
            print 'Numbers remaining', len(files)-count

        subTermDoc = IO.readInTDM(subMatrixDir, diseaseName)
        subTermDoc = subTermDoc.tolil()

        # If the subTermDoc contains nothing, just skip it
        if(subTermDoc.shape[0] == 1 and subTermDoc.shape[1] == 1):
            continue
        
        subTermSum = getColumnSum(subTermDoc,avg)
        subTermSum[0,0] = diseaseHashTable[diseaseName]
        subTermSum[0,:] = subTermDoc.getrow(0)

        diseaseMatrix[diseaseHashTable[diseaseName],0] = diseaseHashTable[diseaseName]
        
        if time_log:
            print 'Time for', diseaseName, str(time.time() - t2)[:4]
            t3 = time.time()

        if output:
            print 'Filling in values in disease matrix for', diseaseName
        for columnIndex in range(1,subTermSum.shape[1]):
            diseaseMatrix[diseaseHashTable[diseaseName],subTermSum[0,columnIndex]] = subTermSum[1,columnIndex]
        if time_log:
            print 'Values filled into disease matrix in', str(time.time() - t3)[:4]
        if output:
            print 'Completed filling in values.'

    # Hack way of making term hashes
    diseaseMatrix[0,:] = range(0,len(termHashTable))
    
    if output:
        print 'Done making disease matrix, writing to'

    IO.writeOutTDM(_termDocDir, label, diseaseMatrix)

    if output:
        print 'Done writing disease matrix.'
        
    return diseaseMatrix
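
A toy sketch of the per-disease summing step above. getColumnSum is not shown in this example, so the sketch computes the column sums directly; only numpy/scipy are assumed and the values are made up.

import numpy as np
from scipy import sparse

subTermDoc = sparse.lil_matrix(np.array([[0, 10, 20],    # row 0: term hashes
                                         [1,  2,  0],    # document 1
                                         [2,  1,  3]]))  # document 2
print subTermDoc.tocsc()[1:, 1:].sum(axis=0)   # column sums over the data rows: [[3 3]]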