def __init__(self, print_param=0, alignment_type='local', gap_scores=[0.2, 0.05]):
    Aligner.__init__(self, print_param, alignment_type, gap_scores)
    # PET91 substitution scores, normalised by the maximum value 15
    # (only one triangle of the matrix is listed; it is mirrored below)
    self.scoring_matrix = {
        'AA': 15/15.0, 'A-': 6/15.0, 'AR': 7/15.0,
        'RR': 15/15.0, 'R-': 6/15.0,
        'AN': 9/15.0, 'RN': 8/15.0, 'NN': 15/15.0, 'N-': 6/15.0,
        'AD': 8/15.0, 'RD': 6/15.0, 'ND': 13/15.0, 'DD': 15/15.0,
        'C-': 6/15.0, 'AC': 7/15.0, 'RC': 8/15.0, 'NC': 7/15.0, 'DC': 3/15.0, 'CC': 15/15.0,
        'Q-': 6/15.0, 'AQ': 7/15.0, 'RQ': 12/15.0, 'NQ': 9/15.0, 'DQ': 9/15.0, 'CQ': 4/15.0, 'QQ': 15/15.0,
        'E-': 6/15.0, 'AE': 8/15.0, 'RE': 7/15.0, 'NE': 9/15.0, 'DE': 15/15.0, 'CE': 2/15.0, 'QE': 12/15.0, 'EE': 15/15.0,
        'G-': 6/15.0, 'AG': 10/15.0, 'RG': 9/15.0, 'NG': 9/15.0, 'DG': 10/15.0, 'CG': 7/15.0, 'QG': 6/15.0, 'EG': 9/15.0, 'GG': 15/15.0,
        'H-': 6/15.0, 'AH': 6/15.0, 'RH': 12/15.0, 'NH': 12/15.0, 'DH': 9/15.0, 'CH': 8/15.0, 'QH': 14/15.0, 'EH': 7/15.0, 'GH': 6/15.0, 'HH': 15/15.0,
        'I-': 6/15.0, 'AI': 9/15.0, 'RI': 4/15.0, 'NI': 6/15.0, 'DI': 3/15.0, 'CI': 5/15.0, 'QI': 4/15.0, 'EI': 3/15.0, 'GI': 4/15.0, 'HI': 4/15.0, 'II': 15/15.0,
        'L-': 6/15.0, 'AL': 6/15.0, 'RL': 5/15.0, 'NL': 4/15.0, 'DL': 2/15.0, 'CL': 5/15.0, 'QL': 7/15.0, 'EL': 2/15.0, 'GL': 2/15.0, 'HL': 6/15.0, 'IL': 12/15.0, 'LL': 15/15.0,
        'K-': 6/15.0, 'AK': 6/15.0, 'RK': 14/15.0, 'NK': 11/15.0, 'DK': 12/15.0, 'CK': 4/15.0, 'QK': 12/15.0, 'EK': 10/15.0, 'GK': 6/15.0, 'HK': 9/15.0, 'IK': 4/15.0, 'LK': 4/15.0, 'KK': 15/15.0,
        'M-': 6/15.0, 'AM': 8/15.0, 'RM': 6/15.0, 'NM': 5/15.0, 'DM': 3/15.0, 'CM': 5/15.0, 'QM': 6/15.0, 'EM': 4/15.0, 'GM': 4/15.0, 'HM': 5/15.0, 'IM': 13/15.0, 'LM': 13/15.0, 'KM': 6/15.0, 'MM': 15/15.0,
        'F-': 6/15.0, 'AF': 4/15.0, 'RF': 2/15.0, 'NF': 4/15.0, 'DF': 1/15.0, 'CF': 8/15.0, 'QF': 3/15.0, 'EF': 0/15.0, 'GF': 1/15.0, 'HF': 8/15.0, 'IF': 9/15.0, 'LF': 12/15.0, 'KF': 1/15.0, 'MF': 8/15.0, 'FF': 15/15.0,
        'P-': 6/15.0, 'AP': 10/15.0, 'RP': 8/15.0, 'NP': 7/15.0, 'DP': 5/15.0, 'CP': 5/15.0, 'QP': 10/15.0, 'EP': 5/15.0, 'GP': 6/15.0, 'HP': 9/15.0, 'IP': 5/15.0, 'LP': 8/15.0, 'KP': 6/15.0, 'MP': 5/15.0, 'FP': 5/15.0, 'PP': 15/15.0,
        'S-': 6/15.0, 'AS': 12/15.0, 'RS': 8/15.0, 'NS': 12/15.0, 'DS': 8/15.0, 'CS': 10/15.0, 'QS': 7/15.0, 'ES': 7/15.0, 'GS': 10/15.0, 'HS': 8/15.0, 'IS': 7/15.0, 'LS': 7/15.0, 'KS': 7/15.0, 'MS': 7/15.0, 'FS': 7/15.0, 'PS': 11/15.0, 'SS': 15/15.0,
        'T-': 6/15.0, 'AT': 12/15.0, 'RT': 7/15.0, 'NT': 11/15.0, 'DT': 7/15.0, 'CT': 7/15.0, 'QT': 7/15.0, 'ET': 6/15.0, 'GT': 8/15.0, 'HT': 7/15.0, 'IT': 10/15.0, 'LT': 6/15.0, 'KT': 8/15.0, 'MT': 10/15.0, 'FT': 4/15.0, 'PT': 10/15.0, 'ST': 12/15.0, 'TT': 15/15.0,
        'W-': 6/15.0, 'AW': 2/15.0, 'RW': 9/15.0, 'NW': 2/15.0, 'DW': 1/15.0, 'CW': 10/15.0, 'QW': 4/15.0, 'EW': 1/15.0, 'GW': 7/15.0, 'HW': 4/15.0, 'IW': 3/15.0, 'LW': 6/15.0, 'KW': 4/15.0, 'MW': 4/15.0, 'FW': 7/15.0, 'PW': 2/15.0, 'SW': 5/15.0, 'TW': 3/15.0, 'WW': 15/15.0,
        'Y-': 6/15.0, 'AY': 3/15.0, 'RY': 5/15.0, 'NY': 8/15.0, 'DY': 6/15.0, 'CY': 12/15.0, 'QY': 6/15.0, 'EY': 3/15.0, 'GY': 2/15.0, 'HY': 14/15.0, 'IY': 5/15.0, 'LY': 6/15.0, 'KY': 3/15.0, 'MY': 4/15.0, 'FY': 15/15.0, 'PY': 3/15.0, 'SY': 7/15.0, 'TY': 4/15.0, 'WY': 8/15.0, 'YY': 15/15.0,
        'V-': 6/15.0, 'AV': 11/15.0, 'RV': 4/15.0, 'NV': 5/15.0, 'DV': 5/15.0, 'CV': 7/15.0, 'QV': 4/15.0, 'EV': 5/15.0, 'GV': 6/15.0, 'HV': 4/15.0, 'IV': 15/15.0, 'LV': 11/15.0, 'KV': 4/15.0, 'MV': 12/15.0, 'FV': 8/15.0, 'PV': 7/15.0, 'SV': 7/15.0, 'TV': 10/15.0, 'WV': 4/15.0, 'YV': 4/15.0, 'VV': 15/15.0,
    }
    # mirror every pair so the matrix can be indexed in either residue order
    for key in list(self.scoring_matrix.keys()):
        if key[1] + key[0] not in self.scoring_matrix:
            self.scoring_matrix[key[1] + key[0]] = self.scoring_matrix[key]
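A standalone sketch (not from the original class) of the mirroring loop above: each pair key is duplicated in reversed residue order so that lookups such as 'RA' and 'AR' return the same score. The values here are a made-up two-entry subset, not the PET91 matrix.

scores = {'AR': 7/15.0, 'AN': 9/15.0}   # toy subset, illustration only
for key in list(scores):                # copy the keys before adding mirrored ones
    mirrored = key[1] + key[0]
    if mirrored not in scores:
        scores[mirrored] = scores[key]
assert scores['RA'] == scores['AR']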
def main(aligner_fname, cascade_fname, image_fnames):
    aligner = Aligner(aligner_fname)
    cascade = load_cascade(cascade_fname)
    images = (pvImage(fname) for fname in image_fnames)
    return [aligner.align_face(detect_faces(img, cascade)[0], img)
            for img in images]
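A hypothetical invocation of the face-alignment main above; all file names are placeholders, and Aligner, load_cascade, detect_faces and pvImage are assumed to be provided by the surrounding module.

aligned_faces = main('aligner_model.xml',             # placeholder model file
                     'haarcascade_frontalface.xml',   # placeholder cascade file
                     ['face_01.jpg', 'face_02.jpg'])  # placeholder images
print(len(aligned_faces))  # one aligned face per input image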
def alignDependingOnWithDuration(URIrecordingNoExt, whichSection, pathToComposition, withDuration, withSynthesis, evalLevel, params, usePersistentFiles, htkParser):
    '''
    call alignment method depending on whether duration or htk selected
    '''
    Phonetizer.initLookupTable(withSynthesis)
    tokenLevelAlignedSuffix, phonemesAlignedSuffix = determineSuffix(withDuration, withSynthesis, evalLevel)

    if withDuration:
        alignmentErrors, detectedWordList, grTruthDurationWordList = alignOneChunk(URIrecordingNoExt, pathToComposition, whichSection, htkParser, params, evalLevel, usePersistentFiles)
    else:
        URIrecordingAnno = URIrecordingNoExt + ANNOTATION_EXT
        URIrecordingWav = URIrecordingNoExt + AUDIO_EXTENSION

        # new makamScore used
        lyricsObj = loadLyrics(pathToComposition, whichSection)
        lyrics = lyricsObj.__str__()
        # in case we are at a no-lyrics section
        if not lyrics or lyrics == '_SAZ_':
            logger.warn("skipping section {} with no lyrics ...".format(whichSection))
            return [], [], [], []

        outputHTKPhoneAlignedURI = Aligner.alignOnechunk(MODEL_URI, URIrecordingWav, lyrics.__str__(), URIrecordingAnno, '/tmp/', withSynthesis)
        alignmentErrors = evalAlignmentError(URIrecordingAnno, outputHTKPhoneAlignedURI, evalLevel)
        detectedWordList = outputHTKPhoneAlignedURI
        grTruthDurationWordList = []

    # store decoding results in a file. FIXME: if with duration it is not mlf
    detectedAlignedfileName = tokenList2TabFile(detectedWordList, URIrecordingNoExt, tokenLevelAlignedSuffix)

    return alignmentErrors, detectedWordList, grTruthDurationWordList, detectedAlignedfileName
def alignOneChunk(pathToHtkModel, path_TO_OUTPUT, lyrics, currPathToAudioFile, isLyricsFromFile, withSynthesis):
    if not os.path.isdir(path_TO_OUTPUT):
        os.mkdir(path_TO_OUTPUT)

    chunkAligner = Aligner(pathToHtkModel, currPathToAudioFile, lyrics, isLyricsFromFile, withSynthesis)

    baseNameAudioFile = os.path.splitext(os.path.basename(chunkAligner.pathToAudioFile))[0]
    outputHTKPhoneAlignedURI = os.path.join(path_TO_OUTPUT, baseNameAudioFile) + HTK_MLF_ALIGNED_SUFFIX

    chunkAligner.alignAudio(0, path_TO_OUTPUT, outputHTKPhoneAlignedURI)
    return outputHTKPhoneAlignedURI
def alignDependingOnWithDuration(URIrecordingNoExt, sectionLink, pathToComposition, withDuration, withSynthesis, evalLevel, params, usePersistentFiles, htkParser):
    '''
    call alignment method depending on whether duration or htk selected
    '''
    #### 1) load lyrics
    makamScore = loadMakamScore(pathToComposition)
    lyrics = makamScore.getLyricsForSection(sectionLink.melodicStructure)
    lyricsStr = lyrics.__str__()

    if not lyricsStr or lyricsStr == 'None' or lyricsStr == '_SAZ_':
        logger.warn("skipping sectionLink {} with no lyrics ...".format(sectionLink.melodicStructure))
        return [], 'dummy', 0, 0, 0

    ##############
    ## reference duration
    # correctDurationScoreDev, totalDuration = getReferenceDurations(URIrecordingNoExt, lyricsWithModels, evalLevel)
    correctDurationScoreDev = 0

    tokenLevelAlignedSuffix, phonemesAlignedSuffix = determineSuffix(withDuration, withSynthesis, evalLevel)

    alignmentErrors = []
    # defaults, so that both branches return the same tuple shape
    correctDuration = 0
    totalDuration = 1
    maxPhiScore = 0

    if withDuration:
        withOracle = 0
        oracleLyrics = 'dummy'
        detectedTokenList, detectedPath, maxPhiScore = alignOneChunk(lyrics, withSynthesis, withOracle, oracleLyrics, [], params.ALPHA, usePersistentFiles, tokenLevelAlignedSuffix, URIrecordingNoExt, sectionLink, htkParser)
        logger.debug('maxPhiScore: ' + str(maxPhiScore))

        # correctDuration, totalDuration = _evalAccuracy(URIrecordingNoExt + ANNOTATION_EXT, detectedTokenList, evalLevel)
        # detectedTokenList = test_oracle(URIrecordingNoExt, pathToComposition, whichSection)
    else:
        URIrecordingAnno = URIrecordingNoExt + ANNOTATION_EXT
        URIrecordingWav = URIrecordingNoExt + AUDIO_EXTENSION

        # new makamScore used
        # lyricsObj = loadLyrics(pathToComposition, whichSection)
        # lyrics = lyricsObj.__str__()
        # in case we are at a no-lyrics sectionLink
        # if not lyrics or lyrics == 'None' or lyrics == '_SAZ_':
        #     logger.warn("skipping sectionLink {} with no lyrics ...".format(whichSection))
        #     return [], [], [], []

        outputHTKPhoneAlignedURI = Aligner.alignOnechunk(MODEL_URI, URIrecordingWav, lyricsStr, URIrecordingAnno, '/tmp/', withSynthesis)
        alignmentErrors = evalAlignmentError(URIrecordingAnno, outputHTKPhoneAlignedURI, evalLevel)
        detectedTokenList = outputHTKPhoneAlignedURI
        # correctDuration, totalDuration = evalAccuracy(URIrecordingAnno, outputHTKPhoneAlignedURI, evalLevel)

    return alignmentErrors, correctDuration, totalDuration, correctDurationScoreDev, maxPhiScore
def add_JAMR_align(instances, aligned_amr_file):
    comments, amr_strings = readAMR(aligned_amr_file)
    for i in range(len(instances)):
        amr = AMR.parse_string(amr_strings[i])
        alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr, alignment, instances[i].tokens)
        ggraph.pre_merge_netag(instances[i])
        #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
        instances[i].addAMR(amr)
        instances[i].addGoldGraph(ggraph)
def main(epitope_list=None, alignments_dir=None, alignment_score_threshold=None, slope_parameter=None, output_file=None):
    # Compute MHC amplitudes for all neoantigens
    a_val_by_index = {}
    peptide_by_index = {}
    sample_by_index = {}
    with open(epitope_list) as f:
        for data in csv.DictReader(f, delimiter='\t'):
            index = data['id']
            sample = data['sample']
            mtpeptide = data['epitope']
            kdwt = data['wt_score']
            kdmt = data['mt_score']
            kdmt = float(kdmt)
            if kdwt == 'nan':
                kdwt = 1000.
            kdwt = float(kdwt)
            index = int(index)
            peptide_by_index[index] = mtpeptide.upper()
            a_val_by_index[index] = kdwt / kdmt
            sample_by_index[index] = sample

    # Compute TCR-recognition probabilities for all neoantigens
    aligner = Aligner()
    for sname in set(sample_by_index.values()):
        xml_path = join(alignments_dir, f'neoantigens_{sname}_iedb.xml')
        aligner.read_all_blast_alignments(xml_path)
    aligner.compute_rval(alignment_score_threshold, slope_parameter)

    # Compute qualities for all epitopes and write the result
    with open(output_file, 'w') as out:
        header = ['Sample', 'NeoantigenID', 'MT.Peptide.Form', 'NeoantigenQuality',
                  'NeoantigenAlignment', 'IEDB_EpitopeAlignment', 'AlignmentScore', 'IEDB_Epitope']
        out.write('\t'.join(header) + '\n')
        for index, peptide in peptide_by_index.items():
            a_val = a_val_by_index[index]
            [r_val, species, alignment] = aligner.get_rval(index)
            neo_alignment = alignment[0]
            epitope_alignment = alignment[1]
            score = alignment[2]
            quality = a_val * r_val
            res = [sample_by_index[index], index, peptide, quality,
                   neo_alignment, epitope_alignment, score, species]
            out.write('\t'.join(map(str, res)) + '\n')
def main(argv):
    '''
    command line parameters:

    neofile            - text file with neoantigen data (supplementary data)
    alignmentDirectory - folder with precomputed alignments
    a                  - midpoint parameter of the logistic function, alignment score threshold
    k                  - slope parameter of the logistic function
    outfile            - path to a file where to output neoantigen fitness computation
    '''
    neofile = argv[1]
    alignmentDirectory = argv[2]
    a = float(argv[3])
    k = float(argv[4])
    outfile = argv[5]
    nmerl = float(argv[6])

    [neoantigens, samples] = readNeoantigens(neofile, nmerl)

    # Compute TCR-recognition probabilities for all neoantigens
    aligner = Aligner()
    for sample in samples:
        xmlpath = alignmentDirectory + "/neoantigens_" + sample + "_iedb.xml"
        aligner.readAllBlastAlignments(xmlpath)
    aligner.computeR(a, k)

    # Write neoantigen recognition potential
    of = open(outfile, 'w')
    header = ["NeoantigenID", "Mutation", "Sample", "MutatedPeptide", "ResidueChangeClass",
              "MutantPeptide", "WildtypePeptide", "A", "R", "Excluded", "NeoantigenRecognitionPotential"]
    of.write("\t".join(header) + "\n")
    for i in neoantigens:
        neoantigen = neoantigens[i]
        w = neoantigen.getWeight()            # excludes neoantigens that mutated from a non-hydrophobic residue on position 2 or 9
        A = neoantigens[i].getA()             # MHC amplitude A
        mtpeptide = neoantigens[i].mtPeptide  # mutant peptide
        wtpeptide = neoantigens[i].wtPeptide
        R = aligner.getR(i)

        # Residue change classes:
        #   HH: from hydrophobic to hydrophobic,     NN: from non-hydrophobic to non-hydrophobic
        #   HN: from hydrophobic to non-hydrophobic, NH: from non-hydrophobic to hydrophobic
        #   other (WW, WH, HW, NW, WN), which include amino acids without a clear classification
        residueChange = neoantigen.residueChange
        fitnessCost = A * R * w

        l = [i, neoantigen.mid, neoantigen.sample, neoantigen.position, residueChange,
             mtpeptide, wtpeptide, A, R, 1 - w, fitnessCost]
        of.write("\t".join(map(str, l)) + "\n")
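For orientation only: the midpoint a and slope k from the docstring parametrise a logistic weighting of alignment scores inside Aligner.computeR; the sketch below shows that general shape, while the exact aggregation over IEDB epitopes lives in the Aligner class.

import math

def logistic_weight(score, a, k):
    # hedged sketch: 0.5 exactly at the midpoint a, approaching 1 for strong alignments
    return 1.0 / (1.0 + math.exp(-k * (score - a)))

print(logistic_weight(26.0, a=26.0, k=1.0))  # 0.5 at the midpoint (example values only)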
def _init_instances(sent_file, amr_strings, comments):
    print >> log, "Preprocess 1:pos, ner and dependency using stanford parser..."
    proc = StanfordCoreNLP()
    instances = proc.parse(sent_file)

    print >> log, "Preprocess 2:adding amr and generating gold graph"
    assert len(instances) == len(amr_strings)
    for i in range(len(instances)):
        amr = AMR.parse_string(amr_strings[i])
        instances[i].addAMR(amr)
        alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr, alignment, comments[i]['snt'])
        ggraph.pre_merge_netag(instances[i])
        instances[i].addGoldGraph(ggraph)
    return instances
def main(argv):
    if len(argv) != 4:
        print("usage: {} <pathToComposition> <whichSection> <URI_recording_no_ext>".format(argv[0]))
        sys.exit()

    URIrecordingNOExt = '/Users/joro/Documents/Phd/UPF/adaptation_data_soloVoice/ISTANBUL/goekhan/02_Gel_3_zemin'
    URIrecordingNOExt = argv[3]
    URIrecordingWav = URIrecordingNOExt + AUDIO_EXTENSION

    pathToComposition = '/Users/joro/Documents/Phd/UPF/adaptation_data_soloVoice/nihavent--sarki--aksak--gel_guzelim--faiz_kapanci/'
    pathToComposition = argv[1]

    whichSection = 3
    whichSection = int(argv[2])

    lyrics = loadLyrics(pathToComposition, whichSection)
    withSynthesis = 1

    URIrecordingAnno = URIrecordingNOExt + PHRASE_ANNOTATION_EXT
    outputHTKPhoneAlignedURI = Aligner.alignOnechunk(MODEL_URI, URIrecordingWav, lyrics, URIrecordingAnno, '/tmp/', withSynthesis)

    EVALLEVEL = 2
    alignmentErrors = evalAlignmentError(URIrecordingAnno, outputHTKPhoneAlignedURI, EVALLEVEL)

    mean, stDev, median = getMeanAndStDevError(alignmentErrors)
    print "(", mean, ",", stDev, ")"

    ### OPTIONAL: open in praat
    withDuration = False
    visualiseInPraat(URIrecordingNOExt, withDuration, outputHTKPhoneAlignedURI, [])

    return mean, stDev, alignmentErrors
def preprocess(amr_file, START_SNLP=True):
    '''nasty function'''
    aligned_amr_file = amr_file + '.aligned'
    if os.path.exists(aligned_amr_file):
        comments, amr_strings = readAMR(aligned_amr_file)
    else:
        comments, amr_strings = readAMR(amr_file)
    #comments,amr_strings = readAMR(aligned_amr_file)
    sentences = [c['snt'] for c in comments]

    tmp_sentence_file = amr_file + '.sent'
    if not os.path.exists(tmp_sentence_file):
        _write_sentences(tmp_sentence_file, sentences)

    print >> log, "pos, ner and dependency..."
    proc = StanfordCoreNLP()
    if START_SNLP:
        proc.setup()
    instances = proc.parse(tmp_sentence_file)

    tok_amr_filename = amr_file + '.tok'
    if not os.path.exists(tok_amr_filename):
        _write_tok_amr(tok_amr_filename, amr_file, instances)

    SpanGraph.graphID = 0
    for i in range(len(instances)):
        amr = AMR.parse_string(amr_strings[i])
        alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
        ggraph = SpanGraph.init_ref_graph(amr, alignment, instances[i].tokens)
        #ggraph.pre_merge_netag(instances[i])
        #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
        instances[i].addAMR(amr)
        instances[i].addGoldGraph(ggraph)

    #print >> log, "adding amr"
    #_add_amr(instances,amr_strings)
    #if writeToFile:
    #    output_file = amr_file.rsplit('.',1)[0] + '_dataInst.p'
    #    pickle.dump(instances,open(output_file,'wb'),pickle.HIGHEST_PROTOCOL)

    return instances
MODEL_URI = os.path.abspath('model/hmmdefs9gmm9iter')

import sys
parentDir = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.path.pardir))
pathHMMDuration = os.path.join(parentDir, 'JingjuAlignment')
if pathHMMDuration not in sys.path:
    sys.path.append(pathHMMDuration)
from lyricsParser import divideIntoSectionsFromAnno, loadLyricsFromTextGridSentence

if __name__ == '__main__':
    # LOAD LYRICS
    lyricsTextGrid = 'dan-xipi_01.TextGrid'
    listSentences = divideIntoSectionsFromAnno(lyricsTextGrid)
    lyrics = loadLyricsFromTextGridSentence(listSentences[0])

    URIrecordingWav = 'dan-xipi_01_32.511032007_51.9222930007.wav'
    # TODO: generate this TextGrid
    lyricsTextGridSentence = 'dan-xipi_01_32.511032007_51.9222930007.TextGrid'
    withSynthesis = 0

    # align
    outputHTKPhoneAlignedURI = Aligner.alignOnechunk(MODEL_URI, URIrecordingWav, lyrics, lyricsTextGridSentence, '/tmp/', withSynthesis)
import sys
import pickle
import codecs

from Aligner import Aligner
from SentenceSplitter import SentenceSplitter

#UTF8Writer = codecs.getwriter('utf8')
#sys.stdout = UTF8Writer(sys.stdout)
char_stream = codecs.getreader("utf-8")(sys.stdin)
#UTF8Reader = codecs.getreader('utf8')
#sys.stdin = UTF8Reader(sys.stdin)

pkl_file = open('../../dictionaries/dictionary.pkl', 'rb')
lang_dict = pickle.load(pkl_file)
pkl_file.close()

pkl_file = open('../../dictionaries/rev_dictionary.pkl', 'rb')
rev_lang_dict = pickle.load(pkl_file)
pkl_file.close()

aligner = Aligner(lang_dict, rev_lang_dict)
splitter = SentenceSplitter()

for line in char_stream:
    try:
        [sentence, translation] = line.strip().split('\t')
        [sentence, dummy] = splitter.split_sentence(sentence)
        [translation, dummy] = splitter.split_english_sentence(translation)
        aligner.print_dict_alignments(sentence, translation, 5)
    except ValueError:
        pass
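The loop above expects one tab-separated sentence/translation pair per stdin line; a toy illustration of the split it performs (the string content is made up):

line = u"ein kleiner Test\ta small test\n"
sentence, translation = line.strip().split('\t')
print(sentence)      # ein kleiner Test
print(translation)   # a small test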
def preprocess(input_file, START_SNLP=True, INPUT_AMR=True):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None

    if INPUT_AMR:   # the input file is an amr annotation
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments, amr_strings = readAMR(aligned_amr_file)
        else:
            comments, amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments]   # here should be 'snt'

        tmp_sent_filename = amr_file + '.sent'
        if not os.path.exists(tmp_sent_filename):   # write sentences into file
            _write_sentences(tmp_sent_filename, sentences)

        print >> log, "Start Stanford CoreNLP..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP:
            proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'   # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename):   # write tokenized amr file
            _write_tok_amr(tok_amr_filename, amr_file, instances)

        SpanGraph.graphID = 0
        for i in range(len(instances)):
            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment, s2c_alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr, alignment, s2c_alignment, instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)
    else:   # input file is sentence
        tmp_sent_filename = input_file

        print >> log, "Start Stanford CoreNLP ..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP:
            proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'   # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

    # preprocess 2: dependency parsing
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename + '.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename, 'w')
            output_dep.write(dep_result)
            output_dep.close()
        _add_dependency(instances, dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename + '.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename + '.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not found' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename, 'r').read()
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename + '.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename + '.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename + '.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    else:
        pass

    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances, prop_filename, dep_filename, FIX_PROP_HEAD=True)
            else:
                _add_prop(instances, prop_filename, dep_filename)
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

    return instances
def preprocess(input_file, START_SNLP=True, INPUT_AMR='amr', PRP_FORMAT='plain'):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None

    if INPUT_AMR == 'amr':   # the input file is an amr annotation
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments, amr_strings = readAMR(aligned_amr_file)
        else:
            comments, amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments]   # here should be 'snt'

        # write sentences (separate per line)
        tmp_sent_filename = amr_file + '.sent'
        if not os.path.exists(tmp_sent_filename):   # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = None
        instances = None
        if PRP_FORMAT == 'plain':
            tmp_prp_filename = tmp_sent_filename + '.prp'
            proc1 = StanfordCoreNLP()

            # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
            if START_SNLP and not os.path.exists(tmp_prp_filename):
                print >> log, "Start Stanford CoreNLP..."
                proc1.setup()
            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        elif PRP_FORMAT == 'xml':   # use the corenlp xml output instead of the plain format; the corenlp wrapper is not used any more
            tmp_prp_filename = tmp_sent_filename + '.prp.xml'
            if not os.path.exists(tmp_prp_filename):
                raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
            instances = load_xml_instances(tmp_prp_filename)
        else:
            raise Exception('Unknown preprocessed file format %s' % PRP_FORMAT)

        tok_sent_filename = tmp_sent_filename + '.tok'   # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename):   # write tokenized amr file
            _write_tok_amr(tok_amr_filename, amr_file, instances)

        SpanGraph.graphID = 0
        for i in xrange(len(instances)):
            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment, s2c_alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
                # use verbalization list to fix the unaligned tokens
                if constants.FLAG_VERB:
                    Aligner.postProcessVerbList(amr, comments[i]['tok'], alignment)
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr, alignment, s2c_alignment, instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    elif INPUT_AMR == 'amreval':
        eval_file = input_file
        comments = readAMREval(eval_file)
        sentences = [c['snt'] for c in comments]

        # write sentences (separate per line)
        tmp_sent_filename = eval_file + '.sent'
        if not os.path.exists(tmp_sent_filename):   # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = tmp_sent_filename + '.prp'
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename):   # found cache file
            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception('No cache file %s has been found. set START_SNLP=True to start corenlp.' % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'   # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        for i in xrange(len(instances)):
            instances[i].addComment(comments[i])

    else:   # input file is sentence
        tmp_sent_filename = input_file

        tmp_prp_filename = None
        instances = None
        if PRP_FORMAT == 'plain':
            tmp_prp_filename = tmp_sent_filename + '.prp'
            proc1 = StanfordCoreNLP()

            # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
            if START_SNLP and not os.path.exists(tmp_prp_filename):
                print >> log, "Start Stanford CoreNLP..."
                proc1.setup()
            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        elif PRP_FORMAT == 'xml':   # use the corenlp xml output instead of the plain format; the corenlp wrapper is not used any more
            tmp_prp_filename = tmp_sent_filename + '.xml'
            if not os.path.exists(tmp_prp_filename):
                raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
            instances = load_xml_instances(tmp_prp_filename)
        else:
            raise Exception('Unknown preprocessed file format %s' % PRP_FORMAT)

        # tmp_prp_filename = tmp_sent_filename+'.prp'
        # proc1 = StanfordCoreNLP()
        # # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        # if START_SNLP and not os.path.exists(tmp_prp_filename):
        #     print >> log, "Start Stanford CoreNLP ..."
        #     proc1.setup()
        #     instances = proc1.parse(tmp_sent_filename)
        # elif os.path.exists(tmp_prp_filename): # found cache file
        #     print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
        #     instances = proc1.parse(tmp_sent_filename)
        # else:
        #     raise Exception('No cache file %s has been found. set START_SNLP=True to start corenlp.' % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'   # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

    # preprocess 2: dependency parsing
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename + '.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = codecs.open(dep_filename, 'w', encoding='utf-8')
            output_dep.write(dep_result)
            output_dep.close()
        _add_dependency(instances, dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename + '.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        if constants.FLAG_ONTO == 'onto':
            dep_filename = tok_sent_filename + '.charniak.onto.parse.dep'
        elif constants.FLAG_ONTO == 'onto+bolt':
            dep_filename = tok_sent_filename + '.charniak.onto+bolt.parse.dep'
        else:
            dep_filename = tok_sent_filename + '.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not found' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename + '.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename + '.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename + '.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    else:
        #pass
        raise Exception('Unknown dependency parse type %s' % (constants.FLAG_DEPPARSER))

    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop' if constants.FLAG_ONTO != 'onto+bolt' else tok_sent_filename + '.onto+bolt.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances, prop_filename, dep_filename, FIX_PROP_HEAD=True)
            else:
                _add_prop(instances, prop_filename, dep_filename)
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

    if constants.FLAG_RNE:
        print >> log, "Using rich name entity instead..."
        rne_filename = tok_sent_filename + '.rne'
        if os.path.exists(rne_filename):
            _substitute_rne(instances, rne_filename)
        else:
            raise IOError('Rich name entity file %s not found!' % (rne_filename))

    return instances
def preprocess(input_file, START_SNLP=True, INPUT_AMR=True, align=True, use_amr_tokens=False):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None

    if INPUT_AMR:   # the input file is an amr annotation
        amr_file = input_file
        if amr_file.endswith('.amr'):
            aligned_amr_file = amr_file + '.tok.aligned'
            amr_tok_file = amr_file + '.tok'
        else:
            aligned_amr_file = amr_file + '.amr.tok.aligned'
            amr_tok_file = amr_file + '.amr.tok'
        tmp_sent_filename = amr_file + '.sent'
        tok_sent_filename = tmp_sent_filename + '.tok'   # tokenized sentence file

        comments, amr_strings = readAMR(amr_file)

        if os.path.exists(aligned_amr_file):
            print "Reading aligned AMR ..."
            # read aligned amr and transfer the alignment comments
            comments_with_alignment, _ = readAMR(aligned_amr_file)
            for comment, comment_with_alignment in zip(comments, comments_with_alignment):
                comment['alignments'] = comment_with_alignment['alignments']

        tokenized_sentences = None
        try:
            if use_amr_tokens:
                tokenized_sentences = [c['tok'] for c in comments]   # here should be 'snt'
                if not os.path.exists(tok_sent_filename):
                    with open(tok_sent_filename, 'w') as f:
                        for sentence in tokenized_sentences:
                            print >> f, sentence
                if tokenized_sentences:
                    print >> log, "AMR has tokens, will use them"
        except:
            raise

        sentences = [c['snt'] for c in comments]   # here should be 'snt'
        if not os.path.exists(tmp_sent_filename):   # write sentences into file
            _write_sentences(tmp_sent_filename, sentences)

        print >> log, "Start Stanford CoreNLP..."
        proc1 = StanfordCoreNLP(tokenize=not tokenized_sentences)

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP:
            proc1.setup()
        instances = proc1.parse(tmp_sent_filename if proc1.tokenize else tok_sent_filename)

        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        if len(instances) == 0:
            print 'Error: no instances!'
            sys.exit(1)

        if not os.path.exists(amr_tok_file):   # write tokenized amr file
            _write_tok_amr(amr_tok_file, amr_file, instances)

        if not os.path.exists(aligned_amr_file) and align:   # align
            print "Call JAMR to generate alignment ..."
            subprocess.call('./scripts/jamr_align.sh ' + amr_tok_file, shell=True)
            print "Reading aligned AMR ..."
            # read aligned amr and transfer the alignment comments
            comments_with_alignment, _ = readAMR(aligned_amr_file)
            for comment, comment_with_alignment in zip(comments, comments_with_alignment):
                comment['alignments'] = comment_with_alignment['alignments']

        from progress import Progress
        p = Progress(len(instances), estimate=True, values=True)
        print 'Parsing AMR:'
        SpanGraph.graphID = 0
        for i in range(len(instances)):
            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment, s2c_alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr, alignment, s2c_alignment, instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)
            instances[i].addComment(comments[i])
            p += 1
        p.complete()
    else:   # input file is sentence
        tmp_sent_filename = input_file

        print >> log, "Start Stanford CoreNLP ..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP:
            proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'   # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

    # preprocess 2: dependency parsing
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename + '.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename, 'w')
            output_dep.write(dep_result)
            output_dep.close()
        _add_dependency(instances, dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename + '.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename + '.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not found' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename, 'r').read()
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename + '.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename + '.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename + '.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    else:
        pass

    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances, prop_filename, dep_filename, FIX_PROP_HEAD=True)
            else:
                _add_prop(instances, prop_filename, dep_filename)
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

    return instances
def preprocess(input_file, START_SNLP=True, INPUT_AMR='amr'):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None

    if INPUT_AMR == 'amr':   # the input file is an amr annotation
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            comments, amr_strings = readAMR(aligned_amr_file)
        else:
            comments, amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments]   # here should be 'snt'

        # write sentences (separate per line)
        tmp_sent_filename = amr_file + '.sent'
        if not os.path.exists(tmp_sent_filename):   # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = tmp_sent_filename + '.prp'
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP..."
            proc1.setup()
        print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'   # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename):   # write tokenized amr file
            _write_tok_amr(tok_amr_filename, amr_file, instances)

        SpanGraph.graphID = 0
        for i in xrange(len(instances)):
            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment, s2c_alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
                # use verbalization list to fix the unaligned tokens
                if constants.FLAG_VERB:
                    Aligner.postProcessVerbList(amr, comments[i]['tok'], alignment)
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr, alignment, s2c_alignment, instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)

    elif INPUT_AMR == 'amreval':
        eval_file = input_file
        comments = readAMREval(eval_file)
        sentences = [c['snt'] for c in comments]

        # write sentences (separate per line)
        tmp_sent_filename = eval_file + '.sent'
        if not os.path.exists(tmp_sent_filename):   # no cache found
            _write_sentences(tmp_sent_filename, sentences)

        tmp_prp_filename = tmp_sent_filename + '.prp'
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename):   # found cache file
            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception('No cache file %s has been found. set START_SNLP=True to start corenlp.' % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'   # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        for i in xrange(len(instances)):
            instances[i].addComment(comments[i])

    else:   # input file is sentence
        tmp_sent_filename = input_file
        tmp_prp_filename = tmp_sent_filename + '.prp'
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP ..."
            proc1.setup()
            instances = proc1.parse(tmp_sent_filename)
        elif os.path.exists(tmp_prp_filename):   # found cache file
            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
            instances = proc1.parse(tmp_sent_filename)
        else:
            raise Exception('No cache file %s has been found. set START_SNLP=True to start corenlp.' % (tmp_prp_filename))

        tok_sent_filename = tmp_sent_filename + '.tok'   # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

    # preprocess 2: dependency parsing
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename + '.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = codecs.open(dep_filename, 'w', encoding='utf-8')
            output_dep.write(dep_result)
            output_dep.close()
        _add_dependency(instances, dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename + '.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        if constants.FLAG_ONTO == 'onto':
            dep_filename = tok_sent_filename + '.charniak.onto.parse.dep'
        elif constants.FLAG_ONTO == 'onto+bolt':
            dep_filename = tok_sent_filename + '.charniak.onto+bolt.parse.dep'
        else:
            dep_filename = tok_sent_filename + '.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not found' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = codecs.open(dep_filename, 'r', encoding='utf-8').read()
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename + '.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename + '.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename + '.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    else:
        #pass
        raise Exception('Unknown dependency parse type %s' % (constants.FLAG_DEPPARSER))

    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop' if constants.FLAG_ONTO != 'onto+bolt' else tok_sent_filename + '.onto+bolt.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances, prop_filename, dep_filename, FIX_PROP_HEAD=True)
            else:
                _add_prop(instances, prop_filename, dep_filename)
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

    if constants.FLAG_RNE:
        print >> log, "Using rich name entity instead..."
        rne_filename = tok_sent_filename + '.rne'
        if os.path.exists(rne_filename):
            _substitute_rne(instances, rne_filename)
        else:
            raise IOError('Rich name entity file %s not found!' % (rne_filename))

    return instances
def preprocess(input_file, START_SNLP=True, INPUT_AMR=True):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None

    if INPUT_AMR:   # the input file is an amr annotation
        amr_file = input_file
        aligned_amr_file = amr_file + '.amr.tok.aligned'
        if os.path.exists(aligned_amr_file):
            print >> log, "Using aligned amr file..."
            comments, amr_strings = readAMR(aligned_amr_file)
        else:
            comments, amr_strings = readAMR(amr_file)
        sentences = [c['snt'] for c in comments]   # here should be 'snt'

        tmp_sent_filename = amr_file + '.sent'
        if not os.path.exists(tmp_sent_filename):   # write sentences into file
            _write_sentences(tmp_sent_filename, sentences)

        print >> log, "Start Stanford CoreNLP..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP:
            proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'   # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        tok_amr_filename = amr_file + '.amr.tok'
        if not os.path.exists(tok_amr_filename):   # write tokenized amr file
            _write_tok_amr(tok_amr_filename, amr_file, instances)

        SpanGraph.graphID = 0
        for i in range(len(instances)):
            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment, s2c_alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr, alignment, s2c_alignment, instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addComment(comments[i])
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)
    else:   # input file is sentence
        tmp_sent_filename = input_file

        print >> log, "Start Stanford CoreNLP ..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP:
            proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'   # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

    # preprocess 2: dependency parsing
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename + '.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename, 'w')
            output_dep.write(dep_result)
            output_dep.close()
        _add_dependency(instances, dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename + '.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename + '.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not found' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename, 'r').read()
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename + '.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename + '.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename + '.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    else:
        pass

    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances, prop_filename, dep_filename, FIX_PROP_HEAD=True)
            else:
                _add_prop(instances, prop_filename, dep_filename)
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

    return instances
def main(argv):
    '''
    command line parameters:

    neofile            - text file with neoantigen data (supplementary data)
    alignmentDirectory - folder with precomputed alignments
    a                  - midpoint parameter of the logistic function, alignment score threshold
    k                  - slope parameter of the logistic function
    outfile            - path to a file where to output neoantigen fitness computation
    '''
    neofile = argv[1]
    alignmentDirectory = argv[2]
    a = float(argv[3])
    k = float(argv[4])
    outfile = argv[5]
    xmlpath = argv[6]

    [neoantigens, samples] = readNeoantigens(neofile)

    # Compute TCR-recognition probabilities for all neoantigens
    aligner = Aligner()
    #for sample in samples:
    #    xmlpath = alignmentDirectory + "/neoantigens_" + sample + "_iedb.xml"
    #    aligner.readAllBlastAlignments(xmlpath)
    aligner.readAllBlastAlignments(xmlpath)
    aligner.computeR(a, k)

    # Write neoantigen recognition potential
    of = open(outfile, 'w')
    #header = ["NeoantigenID","Mutation","Sample","MutatedPeptide","ResidueChangeClass","MutantPeptide","WildtypePeptide","A","R","Excluded","NeoantigenRecognitionPotential"]
    header = ["NeoantigenID", "Mutation", "Sample", "MutantPeptide", "WildtypePeptide", "A", "R", "w", "wc", "Fitness"]
    of.write("\t".join(header) + "\n")
    for i in neoantigens:
        neoantigen = neoantigens[i]
        w = neoantigen.getHydro()             # neoantigen fraction based on the Luksza definition
        wc = neoantigen.getConsortiumHydro()  # neoantigen fraction based on the Consortium definition (https://doi.org/10.1016/j.cell.2020.09.015)
        A = neoantigens[i].getA()             # MHC amplitude A
        mtpeptide = neoantigens[i].mtPeptide  # mutant peptide
        wtpeptide = neoantigens[i].wtPeptide
        R = aligner.getR(i)

        # Residue change classes:
        #   HH: from hydrophobic to hydrophobic,     NN: from non-hydrophobic to non-hydrophobic
        #   HN: from hydrophobic to non-hydrophobic, NH: from non-hydrophobic to hydrophobic
        #   other (WW, WH, HW, NW, WN), which include amino acids without a clear classification
        #residueChange = neoantigen.residueChange
        fitnessCost = A * R * w
        #fitnessCost = A * R

        l = [i, neoantigen.mid, neoantigen.sample, mtpeptide, wtpeptide, A, R, w, wc, fitnessCost]
        of.write("\t".join(map(str, l)) + "\n")
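Toy numbers (made up, for illustration only) showing how the reported fitness combines the MHC amplitude A, the recognition probability R and the hydrophobicity weight w in the loop above:

A, R, w = 3.2, 0.8, 0.9   # example values only
fitness_cost = A * R * w
print(fitness_cost)       # 2.304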