Example no. 1
 def __init__(self, print_param=0, alignment_type='local', gap_scores=[0.2,0.05]):
     Aligner.__init__(self, print_param, alignment_type, gap_scores)
     # PET91 substitution scores, normalized to [0, 1] by dividing by 15
     self.scoring_matrix =  {'AA':15/15.0, 'A-':6/15.0,
                             'AR': 7/15.0, 'RR':15/15.0, 'R-':6/15.0,
                             'AN': 9/15.0, 'RN': 8/15.0, 'NN':15/15.0, 'N-':6/15.0,
                             'AD': 8/15.0, 'RD': 6/15.0, 'ND':13/15.0, 'DD':15/15.0, 'C-': 6/15.0,
                             'AC': 7/15.0, 'RC': 8/15.0, 'NC': 7/15.0, 'DC': 3/15.0, 'CC':15/15.0, 'Q-': 6/15.0,
                             'AQ': 7/15.0, 'RQ':12/15.0, 'NQ': 9/15.0, 'DQ': 9/15.0, 'CQ': 4/15.0, 'QQ':15/15.0, 'E-': 6/15.0,
                             'AE': 8/15.0, 'RE': 7/15.0, 'NE': 9/15.0, 'DE':15/15.0, 'CE': 2/15.0, 'QE':12/15.0, 'EE':15/15.0, 'G-': 6/15.0,
                             'AG':10/15.0, 'RG': 9/15.0, 'NG': 9/15.0, 'DG':10/15.0, 'CG': 7/15.0, 'QG': 6/15.0, 'EG': 9/15.0, 'GG':15/15.0, 'H-': 6/15.0,
                             'AH': 6/15.0, 'RH':12/15.0, 'NH':12/15.0, 'DH': 9/15.0, 'CH': 8/15.0, 'QH':14/15.0, 'EH': 7/15.0, 'GH': 6/15.0, 'HH':15/15.0, 'I-': 6/15.0,
                             'AI': 9/15.0, 'RI': 4/15.0, 'NI': 6/15.0, 'DI': 3/15.0, 'CI': 5/15.0, 'QI': 4/15.0, 'EI': 3/15.0, 'GI': 4/15.0, 'HI': 4/15.0, 'II':15/15.0, 'L-': 6/15.0,
                             'AL': 6/15.0, 'RL': 5/15.0, 'NL': 4/15.0, 'DL': 2/15.0, 'CL': 5/15.0, 'QL': 7/15.0, 'EL': 2/15.0, 'GL': 2/15.0, 'HL': 6/15.0, 'IL':12/15.0, 'LL':15/15.0, 'K-': 6/15.0,
                             'AK': 6/15.0, 'RK':14/15.0, 'NK':11/15.0, 'DK':12/15.0, 'CK': 4/15.0, 'QK':12/15.0, 'EK':10/15.0, 'GK': 6/15.0, 'HK': 9/15.0, 'IK': 4/15.0, 'LK': 4/15.0, 'KK':15/15.0, 'M-': 6/15.0,
                             'AM': 8/15.0, 'RM': 6/15.0, 'NM': 5/15.0, 'DM': 3/15.0, 'CM': 5/15.0, 'QM': 6/15.0, 'EM': 4/15.0, 'GM': 4/15.0, 'HM': 5/15.0, 'IM':13/15.0, 'LM':13/15.0, 'KM': 6/15.0, 'MM':15/15.0, 'F-': 6/15.0,
                             'AF': 4/15.0, 'RF': 2/15.0, 'NF': 4/15.0, 'DF': 1/15.0, 'CF': 8/15.0, 'QF': 3/15.0, 'EF': 0/15.0, 'GF': 1/15.0, 'HF': 8/15.0, 'IF': 9/15.0, 'LF':12/15.0, 'KF': 1/15.0, 'MF': 8/15.0, 'FF':15/15.0, 'P-': 6/15.0,
                             'AP':10/15.0, 'RP': 8/15.0, 'NP': 7/15.0, 'DP': 5/15.0, 'CP': 5/15.0, 'QP':10/15.0, 'EP': 5/15.0, 'GP': 6/15.0, 'HP': 9/15.0, 'IP': 5/15.0, 'LP': 8/15.0, 'KP': 6/15.0, 'MP': 5/15.0, 'FP': 5/15.0, 'PP':15/15.0, 'S-': 6/15.0,
                             'AS':12/15.0, 'RS': 8/15.0, 'NS':12/15.0, 'DS': 8/15.0, 'CS':10/15.0, 'QS': 7/15.0, 'ES': 7/15.0, 'GS':10/15.0, 'HS': 8/15.0, 'IS': 7/15.0, 'LS': 7/15.0, 'KS': 7/15.0, 'MS': 7/15.0, 'FS': 7/15.0, 'PS':11/15.0, 'SS':15/15.0, 'T-': 6/15.0,
                             'AT':12/15.0, 'RT': 7/15.0, 'NT':11/15.0, 'DT': 7/15.0, 'CT': 7/15.0, 'QT': 7/15.0, 'ET': 6/15.0, 'GT': 8/15.0, 'HT': 7/15.0, 'IT':10/15.0, 'LT': 6/15.0, 'KT': 8/15.0, 'MT':10/15.0, 'FT': 4/15.0, 'PT':10/15.0, 'ST':12/15.0, 'TT':15/15.0, 'W-': 6/15.0,
                             'AW': 2/15.0, 'RW': 9/15.0, 'NW': 2/15.0, 'DW': 1/15.0, 'CW':10/15.0, 'QW': 4/15.0, 'EW': 1/15.0, 'GW': 7/15.0, 'HW': 4/15.0, 'IW': 3/15.0, 'LW': 6/15.0, 'KW': 4/15.0, 'MW': 4/15.0, 'FW': 7/15.0, 'PW': 2/15.0, 'SW': 5/15.0, 'TW': 3/15.0, 'WW':15/15.0, 'Y-': 6/15.0,
                             'AY': 3/15.0, 'RY': 5/15.0, 'NY': 8/15.0, 'DY': 6/15.0, 'CY':12/15.0, 'QY': 6/15.0, 'EY': 3/15.0, 'GY': 2/15.0, 'HY':14/15.0, 'IY': 5/15.0, 'LY': 6/15.0, 'KY': 3/15.0, 'MY': 4/15.0, 'FY':15/15.0, 'PY': 3/15.0, 'SY': 7/15.0, 'TY': 4/15.0, 'WY': 8/15.0, 'YY':15/15.0, 'V-': 6/15.0,
                             'AV':11/15.0, 'RV': 4/15.0, 'NV': 5/15.0, 'DV': 5/15.0, 'CV': 7/15.0, 'QV': 4/15.0, 'EV': 5/15.0, 'GV': 6/15.0, 'HV': 4/15.0, 'IV':15/15.0, 'LV':11/15.0, 'KV': 4/15.0, 'MV':12/15.0, 'FV': 8/15.0, 'PV': 7/15.0, 'SV': 7/15.0, 'TV':10/15.0, 'WV': 4/15.0, 'YV': 4/15.0, 'VV':15/15.0}
     # mirror each pair so a score can be looked up in either residue order
     for key in list(self.scoring_matrix):
         if key[1] + key[0] not in self.scoring_matrix:
             self.scoring_matrix[key[1] + key[0]] = self.scoring_matrix[key]
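A side note on the loop above: it mirrors each pair key so the score can be looked up in either residue order, and it copies the key list first to avoid mutating the dictionary while iterating over it. A minimal, self-contained sketch of the same idiom with made-up scores (not the PET91 values):

pair_scores = {'AR': 0.5, 'AN': 0.6, 'RN': 0.4}   # hypothetical pair scores
for key in list(pair_scores):
    reverse_key = key[1] + key[0]
    if reverse_key not in pair_scores:
        pair_scores[reverse_key] = pair_scores[key]
print(pair_scores['RA'])  # 0.5, same as pair_scores['AR']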
Example no. 2
def main(aligner_fname, cascade_fname, image_fnames):
    aligner = Aligner(aligner_fname)
    cascade = load_cascade(cascade_fname)
    images = (pvImage(fname) for fname in image_fnames)
    return [
        aligner.align_face(detect_faces(img, cascade)[0], img)
        for img in images
    ]
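Note that detect_faces(img, cascade)[0] assumes every image yields at least one detection and raises IndexError otherwise. A small, self-contained sketch (hypothetical detection boxes, not the real detector output) of taking the first detection defensively:

detections_per_image = [[(10, 20, 64, 64)], []]  # one face box for the first image, none for the second
first_faces = [boxes[0] if boxes else None for boxes in detections_per_image]
print(first_faces)  # [(10, 20, 64, 64), None]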
Example no. 3
def alignDependingOnWithDuration(URIrecordingNoExt, whichSection, pathToComposition, withDuration, withSynthesis, evalLevel, params, usePersistentFiles, htkParser):
    '''
    call the alignment method, depending on whether duration-based or plain-HTK alignment is selected
    '''

    Phonetizer.initLookupTable(withSynthesis)
    
    tokenLevelAlignedSuffix, phonemesAlignedSuffix = determineSuffix(withDuration, withSynthesis, evalLevel)

    if withDuration:
        alignmentErrors, detectedWordList, grTruthDurationWordList = alignOneChunk(URIrecordingNoExt, pathToComposition, whichSection, htkParser, params, evalLevel, usePersistentFiles)
    else:
        URIrecordingAnno = URIrecordingNoExt + ANNOTATION_EXT
        URIrecordingWav = URIrecordingNoExt + AUDIO_EXTENSION
        # new makamScore used
        lyricsObj = loadLyrics(pathToComposition, whichSection)
        lyrics = str(lyricsObj)
        # in case we are at a no-lyrics section
        if not lyrics or lyrics == '_SAZ_':
            logger.warn("skipping section {} with no lyrics ...".format(whichSection))
            return [], [], [], []

        outputHTKPhoneAlignedURI = Aligner.alignOnechunk(MODEL_URI, URIrecordingWav, lyrics, URIrecordingAnno, '/tmp/', withSynthesis)
        alignmentErrors = evalAlignmentError(URIrecordingAnno, outputHTKPhoneAlignedURI, evalLevel)
        detectedWordList = outputHTKPhoneAlignedURI
        grTruthDurationWordList = []
    
    # store decoding results in a file. FIXME: if withDuration, the output is not an MLF
    detectedAlignedfileName = tokenList2TabFile(detectedWordList, URIrecordingNoExt, tokenLevelAlignedSuffix)

    return alignmentErrors, detectedWordList, grTruthDurationWordList, detectedAlignedfileName
Example no. 4

    def alignOneChunk(pathToHtkModel, path_TO_OUTPUT, lyrics,
                      currPathToAudioFile, isLyricsFromFile, withSynthesis):

        if not (os.path.isdir(path_TO_OUTPUT)):
            os.mkdir(path_TO_OUTPUT)

        chunkAligner = Aligner(pathToHtkModel, currPathToAudioFile, lyrics,
                               isLyricsFromFile, withSynthesis)

        baseNameAudioFile = os.path.splitext(
            os.path.basename(chunkAligner.pathToAudioFile))[0]

        outputHTKPhoneAlignedURI = os.path.join(
            path_TO_OUTPUT, baseNameAudioFile) + HTK_MLF_ALIGNED_SUFFIX

        chunkAligner.alignAudio(0, path_TO_OUTPUT, outputHTKPhoneAlignedURI)

        return outputHTKPhoneAlignedURI
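For reference, a self-contained sketch of the output-path handling used above, with a hypothetical audio file name and suffix constant standing in for the real ones:

import os

audio_path = '/tmp/recordings/02_Gel_3_zemin.wav'  # hypothetical input file
output_dir = '/tmp/aligned'                         # hypothetical output directory
mlf_suffix = '.phoneAligned.mlf'                    # hypothetical suffix constant

base_name = os.path.splitext(os.path.basename(audio_path))[0]
output_uri = os.path.join(output_dir, base_name) + mlf_suffix
print(output_uri)  # /tmp/aligned/02_Gel_3_zemin.phoneAligned.mlf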
Example no. 5
def alignDependingOnWithDuration(URIrecordingNoExt, sectionLink, pathToComposition, withDuration, withSynthesis, evalLevel, params, usePersistentFiles, htkParser):
    '''
    call the alignment method, depending on whether duration-based or plain-HTK alignment is selected
    '''
    #### 1) load lyrics
   
    makamScore = loadMakamScore(pathToComposition)
    
    lyrics = makamScore.getLyricsForSection(sectionLink.melodicStructure)
    
    lyricsStr = str(lyrics)

    if not lyricsStr or lyricsStr == 'None' or lyricsStr == '_SAZ_':
        logger.warn("skipping sectionLink {} with no lyrics ...".format(sectionLink.melodicStructure))
        return [], 'dummy', 0, 0, 0
    
    ##############
    ## reference duration
#     correctDurationScoreDev, totalDuration  = getReferenceDurations(URIrecordingNoExt, lyricsWithModels, evalLevel)
    correctDurationScoreDev = 0
    
    tokenLevelAlignedSuffix, phonemesAlignedSuffix = determineSuffix(withDuration, withSynthesis, evalLevel)
    alignmentErrors = []
    
    if withDuration:

        withOracle = 0
        oracleLyrics = 'dummy'
        detectedTokenList, detectedPath, maxPhiScore = alignOneChunk( lyrics, withSynthesis, withOracle, oracleLyrics, [], params.ALPHA,  usePersistentFiles, tokenLevelAlignedSuffix, URIrecordingNoExt, sectionLink, htkParser)
        logger.debug('maxPhiScore: ' + str(maxPhiScore) )

        correctDuration = 0
        totalDuration = 1
#         correctDuration, totalDuration = _evalAccuracy(URIrecordingNoExt + ANNOTATION_EXT, detectedTokenList, evalLevel )
#         detectedTokenList = test_oracle(URIrecordingNoExt, pathToComposition, whichSection)
            
    else:
        URIrecordingAnno = URIrecordingNoExt + ANNOTATION_EXT
        URIrecordingWav = URIrecordingNoExt + AUDIO_EXTENSION
        # new makamScore used
#         lyricsObj = loadLyrics(pathToComposition, whichSection)
#         lyrics = lyricsObj.__str__()
# #         in case  we are at no-lyrics sectionLink
#         if not lyrics or lyrics=='None' or  lyrics =='_SAZ_':
#             logger.warn("skipping sectionLink {} with no lyrics ...".format(whichSection))
#             return [], [], [], []
    
        outputHTKPhoneAlignedURI = Aligner.alignOnechunk(MODEL_URI, URIrecordingWav, lyricsStr, URIrecordingAnno, '/tmp/', withSynthesis)
        alignmentErrors = evalAlignmentError(URIrecordingAnno, outputHTKPhoneAlignedURI, evalLevel)
        detectedTokenList = outputHTKPhoneAlignedURI

#         correctDuration, totalDuration = evalAccuracy(URIrecordingAnno, outputHTKPhoneAlignedURI, evalLevel)
        # not computed in this branch: set defaults so the return below is well-defined
        correctDuration = 0
        totalDuration = 1
        maxPhiScore = 0

    return alignmentErrors,  correctDuration, totalDuration, correctDurationScoreDev, maxPhiScore
Example no. 6

def add_JAMR_align(instances,aligned_amr_file):
    comments,amr_strings = readAMR(aligned_amr_file)
    for i in range(len(instances)):
        amr = AMR.parse_string(amr_strings[i])
        alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
        ggraph.pre_merge_netag(instances[i])
        #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
        instances[i].addAMR(amr)
        instances[i].addGoldGraph(ggraph)
Example no. 7
def add_JAMR_align(instances, aligned_amr_file):
    comments, amr_strings = readAMR(aligned_amr_file)
    for i in range(len(instances)):
        amr = AMR.parse_string(amr_strings[i])
        alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr, alignment, instances[i].tokens)
        ggraph.pre_merge_netag(instances[i])
        #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
        instances[i].addAMR(amr)
        instances[i].addGoldGraph(ggraph)
Example no. 8
def main(epitope_list=None,
         alignments_dir=None,
         alignment_score_threshold=None,
         slope_parameter=None,
         output_file=None):

    # Compute MHC amplitudes for all neoantigens
    a_val_by_index = {}
    peptide_by_index = {}
    sample_by_index = {}

    with open(epitope_list) as f:
        for data in csv.DictReader(f, delimiter='\t'):
            index = data['id']
            sample = data['sample']
            mtpeptide = data['epitope']
            kdwt = data['wt_score']
            kdmt = data['mt_score']
            kdmt = float(kdmt)
            if kdwt == 'nan':
                kdwt = 1000.
            kdwt = float(kdwt)
            index = int(index)
            peptide_by_index[index] = mtpeptide.upper()
            a_val_by_index[index] = kdwt / kdmt
            sample_by_index[index] = sample

    # Compute TCR-recognition probabilities for all neoantigens
    aligner = Aligner()
    for sname in set(sample_by_index.values()):
        xml_path = join(alignments_dir, f'neoantigens_{sname}_iedb.xml')
        aligner.read_all_blast_alignments(xml_path)
    aligner.compute_rval(alignment_score_threshold, slope_parameter)

    # Compute qualities for all epitopes and write the result
    with open(output_file, 'w') as out:
        header = [
            'Sample', 'NeoantigenID', 'MT.Peptide.Form', 'NeoantigenQuality',
            'NeoantigenAlignment', 'IEDB_EpitopeAlignment', 'AlignmentScore',
            'IEDB_Epitope'
        ]
        out.write('\t'.join(header) + '\n')
        for index, peptide in peptide_by_index.items():
            a_val = a_val_by_index[index]
            [r_val, species, alignment] = aligner.get_rval(index)

            neo_alignment = alignment[0]
            epitope_alignment = alignment[1]
            score = alignment[2]

            quality = a_val * r_val
            res = [
                sample_by_index[index], index, peptide, quality, neo_alignment,
                epitope_alignment, score, species
            ]
            out.write('\t'.join(map(str, res)) + '\n')
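As a quick numerical illustration of the quality computed above, with hypothetical Kd and recognition values rather than real data: the MHC amplitude is the wild-type/mutant Kd ratio, and the quality is that amplitude times the TCR-recognition probability.

kd_wt = 1000.0   # hypothetical wild-type dissociation constant
kd_mt = 50.0     # hypothetical mutant dissociation constant
r_val = 0.2      # hypothetical TCR-recognition probability from the aligner
a_val = kd_wt / kd_mt        # MHC amplitude, as in a_val_by_index above
quality = a_val * r_val      # neoantigen quality = amplitude * recognition probability
print(a_val, quality)        # 20.0 4.0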
Example no. 9
def main(argv):

    '''
    command line parameters:
    neofile - text file with neoantigen data (supplementary data)
    alignmentDirectory - folder with precomputed alignments
    a - midpoint parameter of the logistic function, alignment score threshold
    k - slope parameter of the logistic function
    outfile - path to a file where to output neoantigen fitness computation
    nmerl - n-mer length passed to readNeoantigens
    '''

    neofile = argv[1]
    alignmentDirectory = argv[2]
    a = float(argv[3])
    k = float(argv[4])
    outfile = argv[5]
    nmerl = float(argv[6])

    [neoantigens,samples]=readNeoantigens(neofile, nmerl)
    #Compute TCR-recognition probabilities for all neoantigens
    aligner=Aligner()
    for sample in samples:
        xmlpath=alignmentDirectory+"/neoantigens_"+sample+"_iedb.xml"
        aligner.readAllBlastAlignments(xmlpath)
    aligner.computeR(a, k)

    #Write neoantigen recognition potential
    of=open(outfile,'w')
    header=["NeoantigenID","Mutation","Sample","MutatedPeptide","ResidueChangeClass","MutantPeptide","WildtypePeptide","A","R","Excluded","NeoantigenRecognitionPotential"]
    header="\t".join(header)
    of.write(header+"\n")
    for i in neoantigens:
        neoantigen=neoantigens[i]
        w=neoantigen.getWeight() #excludes neoantigens that mutated from a nonhydrophobic residue on position 2 or 9
        A=neoantigens[i].getA() #MHC amplitude A
        mtpeptide=neoantigens[i].mtPeptide #mutant peptide
        wtpeptide=neoantigens[i].wtPeptide
        R=aligner.getR(i)

        # Residue change:
        # HH: from hydrophobic to hydrophobic,
        # NN: from non-hydrophobic to non-hydrophobic
        # HN: from hydrophobic to non-hydrophobic,
        # NH: from non-hydrophobic to hydrophobic
        # other (WW, WH, HW, NW, WN) which include aminoacids without a clear classification
        residueChange=neoantigen.residueChange

        fitnessCost=A*R*w

        l = [i, neoantigen.mid, neoantigen.sample, neoantigen.position, residueChange, mtpeptide, wtpeptide, A, R, 1-w, fitnessCost]#, neoAlignment, epitopeAlignment, score, species]
        l = "\t".join(map(str, l))
        of.write(l+"\n")
    of.close()
Example no. 10
def _init_instances(sent_file, amr_strings, comments):
    print >> log, "Preprocess 1:pos, ner and dependency using stanford parser..."
    proc = StanfordCoreNLP()
    instances = proc.parse(sent_file)

    print >> log, "Preprocess 2:adding amr and generating gold graph"
    assert len(instances) == len(amr_strings)
    for i in range(len(instances)):
        amr = AMR.parse_string(amr_strings[i])
        instances[i].addAMR(amr)
        alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr, alignment, comments[i]['snt'])
        ggraph.pre_merge_netag(instances[i])
        instances[i].addGoldGraph(ggraph)

    return instances
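A minor readability note: the index loop above pairs instances[i] with amr_strings[i] and comments[i]; the same pairing can be written with zip. A tiny self-contained sketch of the idiom, with plain lists standing in for the real objects:

instances = ['inst0', 'inst1']
amr_strings = ['(a / amr-0)', '(a / amr-1)']
comments = [{'snt': 'sentence 0'}, {'snt': 'sentence 1'}]
for inst, amr_str, comment in zip(instances, amr_strings, comments):
    print(inst, amr_str, comment['snt'])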
Example no. 11

def _init_instances(sent_file,amr_strings,comments):
    print >> log, "Preprocess 1:pos, ner and dependency using stanford parser..."
    proc = StanfordCoreNLP()
    instances = proc.parse(sent_file)
    
    
    print >> log, "Preprocess 2:adding amr and generating gold graph"
    assert len(instances) == len(amr_strings)
    for i in range(len(instances)):
        amr = AMR.parse_string(amr_strings[i])
        instances[i].addAMR(amr)
        alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr,alignment,comments[i]['snt'])
        ggraph.pre_merge_netag(instances[i])
        instances[i].addGoldGraph(ggraph)

    return instances
Example no. 12
def main(argv):

    if len(argv) != 4:
        print(
            "usage: {}  <pathToComposition> <whichSection> <URI_recording_no_ext>"
            .format(argv[0]))
        sys.exit()

    # e.g. '/Users/joro/Documents/Phd/UPF/adaptation_data_soloVoice/ISTANBUL/goekhan/02_Gel_3_zemin'
    URIrecordingNOExt = argv[3]
    URIrecordingWav = URIrecordingNOExt + AUDIO_EXTENSION

    # e.g. '/Users/joro/Documents/Phd/UPF/adaptation_data_soloVoice/nihavent--sarki--aksak--gel_guzelim--faiz_kapanci/'
    pathToComposition = argv[1]

    # e.g. 3
    whichSection = int(argv[2])

    lyrics = loadLyrics(pathToComposition, whichSection)

    withSynthesis = 1

    URIrecordingAnno = URIrecordingNOExt + PHRASE_ANNOTATION_EXT

    outputHTKPhoneAlignedURI = Aligner.alignOnechunk(MODEL_URI,
                                                     URIrecordingWav, lyrics,
                                                     URIrecordingAnno, '/tmp/',
                                                     withSynthesis)
    EVALLEVEL = 2

    alignmentErrors = evalAlignmentError(URIrecordingAnno,
                                         outputHTKPhoneAlignedURI, EVALLEVEL)

    mean, stDev, median = getMeanAndStDevError(alignmentErrors)

    print "(", mean, ",", stDev, ")"

    ### OPTIONAL : open in praat
    withDuration = False
    visualiseInPraat(URIrecordingNOExt, withDuration, outputHTKPhoneAlignedURI,
                     [])

    return mean, stDev, alignmentErrors
Example no. 13
def preprocess(amr_file, START_SNLP=True):
    '''nasty function'''
    aligned_amr_file = amr_file + '.aligned'
    if os.path.exists(aligned_amr_file):
        comments, amr_strings = readAMR(aligned_amr_file)
    else:
        comments, amr_strings = readAMR(amr_file)
    #comments,amr_strings = readAMR(aligned_amr_file)
    sentences = [c['snt'] for c in comments]
    tmp_sentence_file = amr_file + '.sent'
    if not os.path.exists(tmp_sentence_file):
        _write_sentences(tmp_sentence_file, sentences)

    print >> log, "pos, ner and dependency..."
    proc = StanfordCoreNLP()
    if START_SNLP: proc.setup()
    instances = proc.parse(tmp_sentence_file)

    tok_amr_filename = amr_file + '.tok'
    if not os.path.exists(tok_amr_filename):
        _write_tok_amr(tok_amr_filename, amr_file, instances)

    SpanGraph.graphID = 0
    for i in range(len(instances)):

        amr = AMR.parse_string(amr_strings[i])

        alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr, alignment, instances[i].tokens)
        #ggraph.pre_merge_netag(instances[i])
        #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
        instances[i].addAMR(amr)
        instances[i].addGoldGraph(ggraph)

    #print >> log, "adding amr"
    #_add_amr(instances,amr_strings)
    #if writeToFile:
    #    output_file = amr_file.rsplit('.',1)[0]+'_dataInst.p'
    #    pickle.dump(instances,open(output_file,'wb'),pickle.HIGHEST_PROTOCOL)

    return instances
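The file-caching idiom that recurs in these preprocess functions, reuse a derived file if it already exists and otherwise compute and write it, in a minimal self-contained form (hypothetical file name and payload):

import os

cache_path = '/tmp/preprocess_cache.sent'  # hypothetical derived file
if os.path.exists(cache_path):
    with open(cache_path) as f:
        data = f.read()
else:
    data = 'expensive preprocessing result'  # stand-in for the real computation
    with open(cache_path, 'w') as f:
        f.write(data)
print(data)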
Example no. 14

def preprocess(amr_file,START_SNLP=True):
    '''nasty function'''
    aligned_amr_file = amr_file + '.aligned'
    if os.path.exists(aligned_amr_file):
        comments,amr_strings = readAMR(aligned_amr_file)
    else:
        comments,amr_strings = readAMR(amr_file)
    #comments,amr_strings = readAMR(aligned_amr_file)
    sentences = [c['snt'] for c in comments]
    tmp_sentence_file = amr_file+'.sent'
    if not os.path.exists(tmp_sentence_file):
        _write_sentences(tmp_sentence_file,sentences)

    print >> log, "pos, ner and dependency..."
    proc = StanfordCoreNLP()
    if START_SNLP: proc.setup()
    instances = proc.parse(tmp_sentence_file)

    tok_amr_filename = amr_file + '.tok'
    if not os.path.exists(tok_amr_filename):
        _write_tok_amr(tok_amr_filename,amr_file,instances)
    
    SpanGraph.graphID = 0
    for i in range(len(instances)):

        amr = AMR.parse_string(amr_strings[i])
        
        alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
        #ggraph.pre_merge_netag(instances[i])
        #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
        instances[i].addAMR(amr)
        instances[i].addGoldGraph(ggraph)
        
    #print >> log, "adding amr"
    #_add_amr(instances,amr_strings)
    #if writeToFile:
    #    output_file = amr_file.rsplit('.',1)[0]+'_dataInst.p'
    #    pickle.dump(instances,open(output_file,'wb'),pickle.HIGHEST_PROTOCOL)
        
    return instances
Example no. 15
import os
import sys

MODEL_URI = os.path.abspath('model/hmmdefs9gmm9iter')

parentDir = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__) ), os.path.pardir)) 

pathHMMDuration = os.path.join(parentDir, 'JingjuAlignment')
if pathHMMDuration not in sys.path:
    sys.path.append(pathHMMDuration)

from lyricsParser import divideIntoSectionsFromAnno, loadLyricsFromTextGridSentence



if __name__ == '__main__':
    
    # LOAD LYRICS
    lyricsTextGrid = 'dan-xipi_01.TextGrid'
    listSentences = divideIntoSectionsFromAnno(lyricsTextGrid)
    lyrics = loadLyricsFromTextGridSentence(listSentences[0])
    
    URIrecordingWav = 'dan-xipi_01_32.511032007_51.9222930007.wav'
    # TODO: generate this TextGrid
    lyricsTextGridSentence = 'dan-xipi_01_32.511032007_51.9222930007.TextGrid'
    withSynthesis = 0
    
    # align
    outputHTKPhoneAlignedURI = Aligner.alignOnechunk(MODEL_URI, URIrecordingWav, lyrics,  lyricsTextGridSentence,  '/tmp/', withSynthesis)
    
Example no. 16

import sys
from Aligner import Aligner
from SentenceSplitter import SentenceSplitter
import pickle
import codecs

#UTF8Writer = codecs.getwriter('utf8')
#sys.stdout = UTF8Writer(sys.stdout)
char_stream = codecs.getreader("utf-8")(sys.stdin)
#UTF8Reader = codecs.getreader('utf8')
#sys.stdin = UTF8Reader(sys.stdin)

pkl_file = open('../../dictionaries/dictionary.pkl', 'rb')
lang_dict = pickle.load(pkl_file)
pkl_file.close()
pkl_file = open('../../dictionaries/rev_dictionary.pkl', 'rb')
rev_lang_dict = pickle.load(pkl_file)
pkl_file.close()
aligner = Aligner(lang_dict, rev_lang_dict)
splitter = SentenceSplitter()

for line in char_stream:
    try:
        [sentence, translation] = line.strip().split('\t')
        [sentence, dummy] = splitter.split_sentence(sentence)
        [translation, dummy] = splitter.split_english_sentence(translation)
        aligner.print_dict_alignments(sentence, translation, 5)
    except ValueError:
        pass
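For reference, the tab-splitting in the loop above expects exactly one sentence and one translation per line; anything else raises ValueError and the line is skipped. A self-contained illustration with made-up input lines:

lines = ['bonjour le monde\thello world', 'a malformed line without a tab']
for line in lines:
    try:
        sentence, translation = line.strip().split('\t')
        print(sentence + ' -> ' + translation)
    except ValueError:
        pass  # skip lines that do not split into exactly two fields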
Example no. 17
def preprocess(input_file,START_SNLP=True,INPUT_AMR=True):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None
    
    if INPUT_AMR: # the input file is amr annotation
        
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments,amr_strings = readAMR(aligned_amr_file)
        else:
            comments,amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments] # here should be 'snt'
        tmp_sent_filename = amr_file+'.sent'
        if not os.path.exists(tmp_sent_filename): # write sentences into file
            _write_sentences(tmp_sent_filename,sentences)


        print >> log, "Start Stanford CoreNLP..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP: proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename): # write tokenized amr file
            _write_tok_amr(tok_amr_filename,amr_file,instances)
            
        SpanGraph.graphID = 0
        for i in range(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment,s2c_alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr,alignment,s2c_alignment,instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    else:
        # input file is sentence
        tmp_sent_filename = input_file 

        print >> log, "Start Stanford CoreNLP ..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP: proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)
        
    # preprocess 2: dependency parsing 
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename+'.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename,'w')            
            output_dep.write(dep_result)
            output_dep.close()
            
        _add_dependency(instances,dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename+'.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = open(dep_filename,'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))

        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename+'.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not founded' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename,'r').read()
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
            
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename+'.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename+'.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename+'.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
    else:
        pass
    
    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,prop_filename,dep_filename,FIX_PROP_HEAD=True)
            else:
                _add_prop(instances,prop_filename,dep_filename)
            
        else:
            raise IOError('Semantic role labeling file %s not found!'%(prop_filename))

        
    return instances
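A design note on the elif chain above that dispatches on constants.FLAG_DEPPARSER: the same selection can also be expressed as a lookup table. A minimal, self-contained sketch with hypothetical stand-ins for the real parser classes:

def parse_with_turbo(tok_file):   # hypothetical stand-in for TurboDepParser().parse
    return 'turbo dependencies for ' + tok_file

def parse_with_mate(tok_file):    # hypothetical stand-in for MateDepParser().parse
    return 'mate dependencies for ' + tok_file

dep_parsers = {'turbo': parse_with_turbo, 'mate': parse_with_mate}
flag = 'turbo'                    # hypothetical value of constants.FLAG_DEPPARSER
print(dep_parsers[flag]('sentences.tok'))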
Example no. 18
def main(aligner_fname, cascade_fname, image_fnames):
    aligner = Aligner(aligner_fname)
    cascade = load_cascade(cascade_fname)
    images = (pvImage(fname) for fname in image_fnames)
    return [aligner.align_face(detect_faces(img, cascade)[0], img) for img in images]
Example no. 19
def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr',PRP_FORMAT='plain'):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None
    
    if INPUT_AMR == 'amr': # the input file is amr annotation
        
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments,amr_strings = readAMR(aligned_amr_file)
        else:
            comments,amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments] # here should be 'snt'

        # write sentences(separate per line)
        tmp_sent_filename = amr_file+'.sent'
        if not os.path.exists(tmp_sent_filename): # no cache found
            _write_sentences(tmp_sent_filename,sentences)

        tmp_prp_filename = None
        instances = None
        if PRP_FORMAT == 'plain':
            tmp_prp_filename = tmp_sent_filename+'.prp'
            
            
            proc1 = StanfordCoreNLP()

            # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP

            if START_SNLP and not os.path.exists(tmp_prp_filename):
                print >> log, "Start Stanford CoreNLP..."
                proc1.setup()

            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)            
            instances = proc1.parse(tmp_sent_filename)

        elif PRP_FORMAT == 'xml': # rather than using corenlp plain format; using xml format; also we don't use corenlp wrapper anymore
            tmp_prp_filename = tmp_sent_filename+'.prp.xml'
            if not os.path.exists(tmp_prp_filename):
                raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
            instances = load_xml_instances(tmp_prp_filename)
        else:
            raise Exception('Unknown preprocessed file format %s' % PRP_FORMAT)
            
        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename): # write tokenized amr file
            _write_tok_amr(tok_amr_filename,amr_file,instances)
            
        SpanGraph.graphID = 0
        for i in xrange(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment,s2c_alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
                # use verbalization list to fix the unaligned tokens
                if constants.FLAG_VERB: Aligner.postProcessVerbList(amr, comments[i]['tok'], alignment)
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr,alignment,s2c_alignment,instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    elif INPUT_AMR == 'amreval':
        eval_file = input_file
        comments = readAMREval(eval_file)
        sentences = [c['snt'] for c in comments] 

        # write sentences(separate per line)
        tmp_sent_filename = eval_file+'.sent'
        if not os.path.exists(tmp_sent_filename): # no cache found
            _write_sentences(tmp_sent_filename,sentences)

        tmp_prp_filename = tmp_sent_filename+'.prp'

        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename): # found cache file
            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception('No cache file %s has been found. set START_SNLP=True to start corenlp.' % (tmp_prp_filename))
            
        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)
            
        for i in xrange(len(instances)):
            instances[i].addComment(comments[i])
        
    else:        # input file is sentence
        tmp_sent_filename = input_file

        tmp_prp_filename = None
        instances = None
        if PRP_FORMAT == 'plain':
            tmp_prp_filename = tmp_sent_filename+'.prp'

            proc1 = StanfordCoreNLP()

            # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP

            if START_SNLP and not os.path.exists(tmp_prp_filename):
                print >> log, "Start Stanford CoreNLP..."
                proc1.setup()

            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)            
            instances = proc1.parse(tmp_sent_filename)

        elif PRP_FORMAT == 'xml': # rather than using corenlp plain format; using xml format; also we don't use corenlp wrapper anymore
            tmp_prp_filename = tmp_sent_filename+'.xml'
            if not os.path.exists(tmp_prp_filename):
                raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
            instances = load_xml_instances(tmp_prp_filename)
        else:
            raise Exception('Unknown preprocessed file format %s' % PRP_FORMAT)

        
        # tmp_prp_filename = tmp_sent_filename+'.prp'
        # proc1 = StanfordCoreNLP()

        # # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        # if START_SNLP and not os.path.exists(tmp_prp_filename):
        #     print >> log, "Start Stanford CoreNLP ..."
        #     proc1.setup()
        #     instances = proc1.parse(tmp_sent_filename)
        # elif os.path.exists(tmp_prp_filename): # found cache file
        #     print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
        #     instances = proc1.parse(tmp_sent_filename)
        # else:
        #     raise Exception('No cache file %s has been found. set START_SNLP=True to start corenlp.' % (tmp_prp_filename))
        

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)
        
    # preprocess 2: dependency parsing 
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename+'.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = codecs.open(dep_filename,'r',encoding='utf-8').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = codecs.open(dep_filename,'w',encoding='utf-8')            
            output_dep.write(dep_result)
            output_dep.close()
            
        _add_dependency(instances,dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename+'.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = codecs.open(dep_filename,'r',encoding='utf-8').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))

        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        if constants.FLAG_ONTO == 'onto':
            dep_filename = tok_sent_filename+'.charniak.onto.parse.dep'
        elif constants.FLAG_ONTO == 'onto+bolt':
            dep_filename = tok_sent_filename+'.charniak.onto+bolt.parse.dep'
        else:
            dep_filename = tok_sent_filename+'.charniak.parse.dep'            
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not founded' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = codecs.open(dep_filename,'r',encoding='utf-8').read()
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
            
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename+'.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename+'.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename+'.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
    else:
        #pass
        raise Exception('Unknown dependency parse type %s' % (constants.FLAG_DEPPARSER))
    
    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop' if constants.FLAG_ONTO != 'onto+bolt' else tok_sent_filename + '.onto+bolt.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,prop_filename,dep_filename,FIX_PROP_HEAD=True)
            else:
                _add_prop(instances,prop_filename,dep_filename)
            
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

    if constants.FLAG_RNE:
        print >> log, "Using rich name entity instead..."
        rne_filename = tok_sent_filename + '.rne'
        if os.path.exists(rne_filename):
            _substitute_rne(instances, rne_filename)
        else:
            raise IOError('Rich name entity file %s not found!' % (rne_filename))

        
    return instances
Example no. 20
def preprocess(input_file,START_SNLP=True,INPUT_AMR=True, align=True, use_amr_tokens=False):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None
    
    if INPUT_AMR: # the input file is amr annotation

        amr_file = input_file
        if amr_file.endswith('.amr'):
            aligned_amr_file = amr_file + '.tok.aligned'
            amr_tok_file = amr_file + '.tok'
        else:
            aligned_amr_file = amr_file + '.amr.tok.aligned'
            amr_tok_file = amr_file + '.amr.tok'

        tmp_sent_filename = amr_file+'.sent'
        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file

        comments,amr_strings = readAMR(amr_file)
        if os.path.exists(aligned_amr_file):
            print "Reading aligned AMR ..."
            # read aligned amr and transfer alignment comments
            comments_with_alignment,_ = readAMR(aligned_amr_file)
            for comment,comment_with_alignment in zip(comments,comments_with_alignment):
                comment['alignments'] = comment_with_alignment['alignments']

        tokenized_sentences = None
        try:
            if use_amr_tokens:
                tokenized_sentences = [c['tok'] for c in comments] # here should be 'tok'
                if not os.path.exists(tok_sent_filename):
                    with open(tok_sent_filename,'w') as f:
                        for sentence in tokenized_sentences:
                            print >> f, sentence
                if tokenized_sentences:
                    print >> log, "AMR has tokens, will use them"
        except:
            raise

        sentences = [c['snt'] for c in comments] # here should be 'snt'
        if not os.path.exists(tmp_sent_filename): # write sentences into file
            _write_sentences(tmp_sent_filename,sentences)

        print >> log, "Start Stanford CoreNLP..."
        proc1 = StanfordCoreNLP(tokenize=not tokenized_sentences)

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP: proc1.setup()

        instances = proc1.parse(tmp_sent_filename if proc1.tokenize else tok_sent_filename)

        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)

        if len(instances) == 0:
            print 'Error: no instances!'
            sys.exit(1)

        if not os.path.exists(amr_tok_file): # write tokenized amr file
            _write_tok_amr(amr_tok_file,amr_file,instances)
            
        if not os.path.exists(aligned_amr_file) and align:
            # align
            print "Call JAMR to generate alignment ..."
            subprocess.call('./scripts/jamr_align.sh '+amr_tok_file,shell=True)
            print "Reading aligned AMR ..."
            # read aligned amr and transfer alignment comments
            comments_with_alignment,_ = readAMR(aligned_amr_file)
            for comment,comment_with_alignment in zip(comments,comments_with_alignment):
                comment['alignments'] = comment_with_alignment['alignments']

        from progress import Progress
        p = Progress(len(instances), estimate=True, values=True)
        print 'Parsing AMR:'
        SpanGraph.graphID = 0
        for i in range(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment,s2c_alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr,alignment,s2c_alignment,instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)
            instances[i].addComment(comments[i])
            p += 1
        p.complete()

    else:
        # input file is sentence
        tmp_sent_filename = input_file 

        print >> log, "Start Stanford CoreNLP ..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP: proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)
        
    # preprocess 2: dependency parsing 
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename+'.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename,'w')            
            output_dep.write(dep_result)
            output_dep.close()
            
        _add_dependency(instances,dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename+'.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = open(dep_filename,'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))

        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename+'.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not founded' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename,'r').read()
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
            
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename+'.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename+'.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename+'.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
    else:
        pass
    
    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,prop_filename,dep_filename,FIX_PROP_HEAD=True)
            else:
                _add_prop(instances,prop_filename,dep_filename)
            
        else:
            raise IOError('Semantic role labeling file %s not found!'%(prop_filename))

        
    return instances
Example no. 21
def preprocess(input_file, START_SNLP=True, INPUT_AMR='amr'):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None

    if INPUT_AMR == 'amr':  # the input file is amr annotation

        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments, amr_strings = readAMR(aligned_amr_file)
        else:
            comments, amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments]  # here should be 'snt'

        # write sentences(separate per line)
        tmp_sent_filename = amr_file + '.sent'
        if not os.path.exists(tmp_sent_filename):  # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = tmp_sent_filename + '.prp'

        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP

        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP..."
            proc1.setup()

        print >> log, 'Read token,lemma,name entity file %s...' % (
            tmp_prp_filename)
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename):  # write tokenized amr file
            _write_tok_amr(tok_amr_filename, amr_file, instances)

        SpanGraph.graphID = 0
        for i in xrange(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment, s2c_alignment = Aligner.readJAMRAlignment(
                    amr, comments[i]['alignments'])
                # use verbalization list to fix the unaligned tokens
                if constants.FLAG_VERB:
                    Aligner.postProcessVerbList(amr, comments[i]['tok'],
                                                alignment)
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr, alignment,
                                                      s2c_alignment,
                                                      instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    elif INPUT_AMR == 'amreval':
        eval_file = input_file
        comments = readAMREval(eval_file)
        sentences = [c['snt'] for c in comments]

        # write sentences(separate per line)
        tmp_sent_filename = eval_file + '.sent'
        if not os.path.exists(tmp_sent_filename):  # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = tmp_sent_filename + '.prp'

        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename):  # found cache file
            print >> log, 'Read token,lemma,name entity file %s...' % (
                tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception(
                'No cache file %s has been found. set START_SNLP=True to start corenlp.'
                % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        for i in xrange(len(instances)):
            instances[i].addComment(comments[i])

    else:  # input file is sentence
        tmp_sent_filename = input_file
        tmp_prp_filename = tmp_sent_filename + '.prp'

        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename):  # found cache file
            print >> log, 'Read token,lemma,name entity file %s...' % (
                tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception(
                'No cache file %s has been found. set START_SNLP=True to start corenlp.'
                % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

    # preprocess 2: dependency parsing
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename + '.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = codecs.open(dep_filename, 'r',
                                     encoding='utf-8').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = codecs.open(dep_filename, 'w', encoding='utf-8')
            output_dep.write(dep_result)
            output_dep.close()

        _add_dependency(instances, dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename + '.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = codecs.open(dep_filename, 'r',
                                     encoding='utf-8').read()
        else:
            raise IOError('Converted dependency file %s not found' %
                          (dep_filename))

        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        if constants.FLAG_ONTO == 'onto':
            dep_filename = tok_sent_filename + '.charniak.onto.parse.dep'
        elif constants.FLAG_ONTO == 'onto+bolt':
            dep_filename = tok_sent_filename + '.charniak.onto+bolt.parse.dep'
        else:
            dep_filename = tok_sent_filename + '.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not founded' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename + '.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename + '.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename + '.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    else:
        #pass
        raise Exception('Unknown dependency parser type %s' %
                        (constants.FLAG_DEPPARSER))

    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop' if constants.FLAG_ONTO != 'onto+bolt' else tok_sent_filename + '.onto+bolt.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,
                          prop_filename,
                          dep_filename,
                          FIX_PROP_HEAD=True)
            else:
                _add_prop(instances, prop_filename, dep_filename)

        else:
            raise IOError('Semantic role labeling file %s not found!' %
                          (prop_filename))

    if constants.FLAG_RNE:
        print >> log, "Using rich name entity instead..."
        rne_filename = tok_sent_filename + '.rne'
        if os.path.exists(rne_filename):
            _substitute_rne(instances, rne_filename)
        else:
            raise IOError('Rich named entity file %s not found!' %
                          (rne_filename))

    return instances
def preprocess(input_file,START_SNLP=True,INPUT_AMR=True):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None
    
    if INPUT_AMR: # the input file is amr annotation
        
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            print >> log, "Using aligned amr file..."
            comments,amr_strings = readAMR(aligned_amr_file)
        else:
            comments,amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments] # the sentence text is stored under the 'snt' key
        tmp_sent_filename = amr_file+'.sent'
        if not os.path.exists(tmp_sent_filename): # write sentences into file
            _write_sentences(tmp_sent_filename,sentences)


        print >> log, "Start Stanford CoreNLP..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP: proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename): # write tokenized amr file
            _write_tok_amr(tok_amr_filename,amr_file,instances)
            
        SpanGraph.graphID = 0
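        # For each sentence, parse the gold AMR string, read its JAMR alignment and
        # attach the resulting gold span graph to the instance.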
        for i in range(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment,s2c_alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr,alignment,s2c_alignment,instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    else:
        # input file is sentence
        tmp_sent_filename = input_file 

        print >> log, "Start Stanford CoreNLP ..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        if START_SNLP: proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)
        
    # preprocess 2: dependency parsing 
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename+'.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename,'w')            
            output_dep.write(dep_result)
            output_dep.close()
            
        _add_dependency(instances,dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename+'.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = open(dep_filename,'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))

        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename+'.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not founded' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename,'r').read()
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
            
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename+'.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename+'.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename+'.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
    else:
        pass
    
    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,prop_filename,dep_filename,FIX_PROP_HEAD=True)
            else:
                _add_prop(instances,prop_filename,dep_filename)
            
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

        
    return instances
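
# A minimal usage sketch (hypothetical input path; assumes the constants.FLAG_* options
# are configured as in the branches above):
#   instances = preprocess('amr-release-train.txt', START_SNLP=True, INPUT_AMR=True)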
Example no. 23
0
def main(argv):
    
    '''
    command line parameters:
    neofile - text file with neoantigen data (supplementary data)
    alignmentDirectory - folder with precomputed alignments
    a - midpoint parameter of the logistic function (alignment score threshold)
    k - slope parameter of the logistic function
    outfile - path of the output file for the neoantigen fitness values
    xmlpath - BLAST alignment XML file read by the aligner
    '''
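    # Illustrative invocation (script name, file names and parameter values are placeholders):
    #   python neoantigen_fitness.py neoantigens.txt alignments/ 26 4.87 fitness_out.txt neoantigens_iedb.xml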
        
    neofile=argv[1]
    alignmentDirectory=argv[2]
    a=float(argv[3])
    k=float(argv[4])
    outfile=argv[5]
    xmlpath=argv[6]
 
    [neoantigens,samples]=readNeoantigens(neofile)    
    #Compute TCR-recognition probabilities for all neoantigens
    aligner=Aligner()    
    #for sample in samples:
    #    xmlpath=alignmentDirectory+"/neoantigens_"+sample+"_iedb.xml"
    #    aligner.readAllBlastAlignments(xmlpath)    
    #xmlpath=alignmentDirectory+"/neoantigens_5NSAK_iedb.xml"
    #xmlpath="/scratch/eknodel/cohen_melanoma/validated_peptides/3466/3466_lukszablast.out"
    aligner.readAllBlastAlignments(xmlpath)
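    # computeR presumably converts each neoantigen's alignment scores into a
    # TCR-recognition probability R via a logistic function with midpoint a and
    # slope k (see the parameter descriptions in the docstring above).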
    aligner.computeR(a, k)    
    
    #Write neoantigen recognition potential
    of=open(outfile,'w')
    #header=["NeoantigenID","Mutation","Sample","MutatedPeptide","ResidueChangeClass","MutantPeptide","WildtypePeptide","A","R","Excluded","NeoantigenRecognitionPotential"]
    header=["NeoantigenID","Mutation","Sample","MutantPeptide","WildtypePeptide","A","R","w","wc","Fitness"]
    header="\t".join(header)
    of.write(header+"\n")
    for i in neoantigens:
        #print(i)
        neoantigen=neoantigens[i]
        #print(neoantigen)
        w=neoantigen.getHydro() #calculates neoantigen fraction based on Luksza definition
        wc = neoantigen.getConsortiumHydro() #Calculates neoantigen fraction based on the Consortium's definition (https://doi.org/10.1016/j.cell.2020.09.015)
        A=neoantigens[i].getA() #MHC amplitude A
        #print(A)
        mtpeptide=neoantigens[i].mtPeptide #mutant peptide
        wtpeptide=neoantigens[i].wtPeptide
        R=aligner.getR(i)
        #print(R)        
        
        # Residue change:
        # HH: from hydrophobic to hydrophobic, 
        # NN: from non-hydrophobic to non-hydrophobic
        # HN: from hydrophobic to non-hydrophobic, 
        # NH: from non-hydrophobic to hydrophobic
        # other (WW, WH, HW, NW, WN) which include amino acids without a clear classification
        #residueChange=neoantigen.residueChange 
        #print(residueChange)
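        # The fitness cost below combines the MHC amplitude A, the TCR-recognition
        # probability R and the hydrophobicity fraction w computed above.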
 
        fitnessCost=A*R*w
        #fitnessCost=A*R
        
        #l=[i, neoantigen.mid, neoantigen.sample, neoantigen.position, residueChange, mtpeptide, wtpeptide, A,R, 1-w, fitnessCost]#, neoAlignment, epitopeAlignment, score, species]
        l=[i, neoantigen.mid, neoantigen.sample, mtpeptide, wtpeptide, A,R,w,wc, fitnessCost]#, neoAlignment, epitopeAlignment, score, species]
        l="\t".join(map(lambda s: str(s),l))
        of.write(l+"\n")
Example no. 24
0
 def __init__(self,
              print_param=0,
              alignment_type='local',
              gap_scores=[0.2, 0.05]):
     Aligner.__init__(self, print_param, alignment_type, gap_scores)
     # PET91
     self.scoring_matrix = {
         'AA': 15 / 15.0,
         'A-': 6 / 15.0,
         'AR': 7 / 15.0,
         'RR': 15 / 15.0,
         'R-': 6 / 15.0,
         'AN': 9 / 15.0,
         'RN': 8 / 15.0,
         'NN': 15 / 15.0,
         'N-': 6 / 15.0,
         'AD': 8 / 15.0,
         'RD': 6 / 15.0,
         'ND': 13 / 15.0,
         'DD': 15 / 15.0,
         'C-': 6 / 15.0,
         'AC': 7 / 15.0,
         'RC': 8 / 15.0,
         'NC': 7 / 15.0,
         'DC': 3 / 15.0,
         'CC': 15 / 15.0,
         'Q-': 6 / 15.0,
         'AQ': 7 / 15.0,
         'RQ': 12 / 15.0,
         'NQ': 9 / 15.0,
         'DQ': 9 / 15.0,
         'CQ': 4 / 15.0,
         'QQ': 15 / 15.0,
         'E-': 6 / 15.0,
         'AE': 8 / 15.0,
         'RE': 7 / 15.0,
         'NE': 9 / 15.0,
         'DE': 15 / 15.0,
         'CE': 2 / 15.0,
         'QE': 12 / 15.0,
         'EE': 15 / 15.0,
         'G-': 6 / 15.0,
         'AG': 10 / 15.0,
         'RG': 9 / 15.0,
         'NG': 9 / 15.0,
         'DG': 10 / 15.0,
         'CG': 7 / 15.0,
         'QG': 6 / 15.0,
         'EG': 9 / 15.0,
         'GG': 15 / 15.0,
         'H-': 6 / 15.0,
         'AH': 6 / 15.0,
         'RH': 12 / 15.0,
         'NH': 12 / 15.0,
         'DH': 9 / 15.0,
         'CH': 8 / 15.0,
         'QH': 14 / 15.0,
         'EH': 7 / 15.0,
         'GH': 6 / 15.0,
         'HH': 15 / 15.0,
         'I-': 6 / 15.0,
         'AI': 9 / 15.0,
         'RI': 4 / 15.0,
         'NI': 6 / 15.0,
         'DI': 3 / 15.0,
         'CI': 5 / 15.0,
         'QI': 4 / 15.0,
         'EI': 3 / 15.0,
         'GI': 4 / 15.0,
         'HI': 4 / 15.0,
         'II': 15 / 15.0,
         'L-': 6 / 15.0,
         'AL': 6 / 15.0,
         'RL': 5 / 15.0,
         'NL': 4 / 15.0,
         'DL': 2 / 15.0,
         'CL': 5 / 15.0,
         'QL': 7 / 15.0,
         'EL': 2 / 15.0,
         'GL': 2 / 15.0,
         'HL': 6 / 15.0,
         'IL': 12 / 15.0,
         'LL': 15 / 15.0,
         'K-': 6 / 15.0,
         'AK': 6 / 15.0,
         'RK': 14 / 15.0,
         'NK': 11 / 15.0,
         'DK': 12 / 15.0,
         'CK': 4 / 15.0,
         'QK': 12 / 15.0,
         'EK': 10 / 15.0,
         'GK': 6 / 15.0,
         'HK': 9 / 15.0,
         'IK': 4 / 15.0,
         'LK': 4 / 15.0,
         'KK': 15 / 15.0,
         'M-': 6 / 15.0,
         'AM': 8 / 15.0,
         'RM': 6 / 15.0,
         'NM': 5 / 15.0,
         'DM': 3 / 15.0,
         'CM': 5 / 15.0,
         'QM': 6 / 15.0,
         'EM': 4 / 15.0,
         'GM': 4 / 15.0,
         'HM': 5 / 15.0,
         'IM': 13 / 15.0,
         'LM': 13 / 15.0,
         'KM': 6 / 15.0,
         'MM': 15 / 15.0,
         'F-': 6 / 15.0,
         'AF': 4 / 15.0,
         'RF': 2 / 15.0,
         'NF': 4 / 15.0,
         'DF': 1 / 15.0,
         'CF': 8 / 15.0,
         'QF': 3 / 15.0,
         'EF': 0 / 15.0,
         'GF': 1 / 15.0,
         'HF': 8 / 15.0,
         'IF': 9 / 15.0,
         'LF': 12 / 15.0,
         'KF': 1 / 15.0,
         'MF': 8 / 15.0,
         'FF': 15 / 15.0,
         'P-': 6 / 15.0,
         'AP': 10 / 15.0,
         'RP': 8 / 15.0,
         'NP': 7 / 15.0,
         'DP': 5 / 15.0,
         'CP': 5 / 15.0,
         'QP': 10 / 15.0,
         'EP': 5 / 15.0,
         'GP': 6 / 15.0,
         'HP': 9 / 15.0,
         'IP': 5 / 15.0,
         'LP': 8 / 15.0,
         'KP': 6 / 15.0,
         'MP': 5 / 15.0,
         'FP': 5 / 15.0,
         'PP': 15 / 15.0,
         'S-': 6 / 15.0,
         'AS': 12 / 15.0,
         'RS': 8 / 15.0,
         'NS': 12 / 15.0,
         'DS': 8 / 15.0,
         'CS': 10 / 15.0,
         'QS': 7 / 15.0,
         'ES': 7 / 15.0,
         'GS': 10 / 15.0,
         'HS': 8 / 15.0,
         'IS': 7 / 15.0,
         'LS': 7 / 15.0,
         'KS': 7 / 15.0,
         'MS': 7 / 15.0,
         'FS': 7 / 15.0,
         'PS': 11 / 15.0,
         'SS': 15 / 15.0,
         'T-': 6 / 15.0,
         'AT': 12 / 15.0,
         'RT': 7 / 15.0,
         'NT': 11 / 15.0,
         'DT': 7 / 15.0,
         'CT': 7 / 15.0,
         'QT': 7 / 15.0,
         'ET': 6 / 15.0,
         'GT': 8 / 15.0,
         'HT': 7 / 15.0,
         'IT': 10 / 15.0,
         'LT': 6 / 15.0,
         'KT': 8 / 15.0,
         'MT': 10 / 15.0,
         'FT': 4 / 15.0,
         'PT': 10 / 15.0,
         'ST': 12 / 15.0,
         'TT': 15 / 15.0,
         'W-': 6 / 15.0,
         'AW': 2 / 15.0,
         'RW': 9 / 15.0,
         'NW': 2 / 15.0,
         'DW': 1 / 15.0,
         'CW': 10 / 15.0,
         'QW': 4 / 15.0,
         'EW': 1 / 15.0,
         'GW': 7 / 15.0,
         'HW': 4 / 15.0,
         'IW': 3 / 15.0,
         'LW': 6 / 15.0,
         'KW': 4 / 15.0,
         'MW': 4 / 15.0,
         'FW': 7 / 15.0,
         'PW': 2 / 15.0,
         'SW': 5 / 15.0,
         'TW': 3 / 15.0,
         'WW': 15 / 15.0,
         'Y-': 6 / 15.0,
         'AY': 3 / 15.0,
         'RY': 5 / 15.0,
         'NY': 8 / 15.0,
         'DY': 6 / 15.0,
         'CY': 12 / 15.0,
         'QY': 6 / 15.0,
         'EY': 3 / 15.0,
         'GY': 2 / 15.0,
         'HY': 14 / 15.0,
         'IY': 5 / 15.0,
         'LY': 6 / 15.0,
         'KY': 3 / 15.0,
         'MY': 4 / 15.0,
         'FY': 15 / 15.0,
         'PY': 3 / 15.0,
         'SY': 7 / 15.0,
         'TY': 4 / 15.0,
         'WY': 8 / 15.0,
         'YY': 15 / 15.0,
         'V-': 6 / 15.0,
         'AV': 11 / 15.0,
         'RV': 4 / 15.0,
         'NV': 5 / 15.0,
         'DV': 5 / 15.0,
         'CV': 7 / 15.0,
         'QV': 4 / 15.0,
         'EV': 5 / 15.0,
         'GV': 6 / 15.0,
         'HV': 4 / 15.0,
         'IV': 15 / 15.0,
         'LV': 11 / 15.0,
         'KV': 4 / 15.0,
         'MV': 12 / 15.0,
         'FV': 8 / 15.0,
         'PV': 7 / 15.0,
         'SV': 7 / 15.0,
         'TV': 10 / 15.0,
         'WV': 4 / 15.0,
         'YV': 4 / 15.0,
         'VV': 15 / 15.0
     }
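     # The matrix above lists each unordered residue pair once; the loop below mirrors
     # the entries so that lookups succeed with the residues in either order
     # (e.g. 'RA' as well as 'AR').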
     for key in self.scoring_matrix.keys():
         if key[1] + key[0] not in self.scoring_matrix.keys():
             self.scoring_matrix[key[1] +
                                 key[0]] = self.scoring_matrix[key[0] +
                                                               key[1]]