def word2phrase_filelevel(formatCorpusFile, depFile, outputPhraseCorpusFile, outputPhraseDepFile):
    """
    func: rewrite a word based corpus and its dependency file in phrase based form
    :param formatCorpusFile: path of the word based corpus to convert
    :param depFile: path of the dependency file that matches formatCorpusFile
    :param outputPhraseCorpusFile: path the phrase based corpus is written to
    :param outputPhraseDepFile: path the phrase based dependency lines are written to
    :return: n/a
    """
    phrase_lookup = buildBilingualDict.phraseMapping('../data/phrase.lst')
    triples_per_sentence = readInDependencyTriples(depFile)
    raw_sentences, _unused_clbls, flabels = get_english_raw_sentences_labels(formatCorpusFile)

    # The two inputs are paired by position, so they must have the same length.
    assert len(raw_sentences) == len(triples_per_sentence)

    dep_lines = []
    corpus_lines = []
    for idx, sentence in enumerate(raw_sentences):
        merged = mergeDependencyTree(triples_per_sentence[idx], sentence, phrase_lookup)
        phrase_sentence = word2phrase_sentencelevel(sentence, phrase_lookup)
        dep_lines.append(formDependencyTripleLine(merged))
        # One corpus line per sentence: label, tab, phrase based sentence.
        corpus_lines.append(flabels[idx] + '\t' + phrase_sentence + '\n')

    # Nothing is written until every sentence has been converted.
    with open(outputPhraseCorpusFile, 'w') as corpus_out:
        corpus_out.writelines(corpus_lines)
    with open(outputPhraseDepFile, 'w') as dep_out:
        dep_out.writelines(dep_lines)
def word2phrase_filelevel(formatCorpusFile, depFile, outputPhraseCorpusFile,
                          outputPhraseDepFile,
                          phraseListFile='../data/phrase.lst'):
    """
    func: transfer word based corpus into phrase based one
    :param formatCorpusFile: the file that needs to be transferred
    :param depFile: the corresponding dependency file
    :param outputPhraseCorpusFile: the transferred phrase based output file;
        each line is written as "<label>\\t<phrase sentence>\\n"
    :param outputPhraseDepFile: the transferred phrase based dependency file;
        one entry per sentence, as produced by formDependencyTripleLine
    :param phraseListFile: phrase list passed to buildBilingualDict.phraseMapping;
        the default keeps the previously hard-coded path, which is relative
        to the current working directory
    :return: n/a
    :raises AssertionError: when the corpus and the dependency file do not
        yield the same number of sentences
    """
    phrasemap = buildBilingualDict.phraseMapping(phraseListFile)
    sentences_triples = readInDependencyTriples(depFile)
    # The second returned value (clbls in the original code) is not used here.
    sentences, _clbls, flbls = get_english_raw_sentences_labels(
        formatCorpusFile)

    # Sentences and dependency triples are paired by position.
    assert len(sentences) == len(sentences_triples), (
        'sentence count (%d) does not match dependency entry count (%d)'
        % (len(sentences), len(sentences_triples)))

    newDepinfo = []
    newCorpus = []
    for i, sent in enumerate(sentences):
        sent_triples = sentences_triples[i]
        newdeps = mergeDependencyTree(sent_triples, sent, phrasemap)
        newsent = word2phrase_sentencelevel(sent, phrasemap)
        newDepinfo.append(formDependencyTripleLine(newdeps))
        newCorpus.append(flbls[i] + '\t' + newsent + '\n')

    # Output files are only touched once the whole corpus has been converted,
    # so a failure above leaves any existing outputs untouched.
    with open(outputPhraseCorpusFile, 'w') as writer:
        writer.writelines(newCorpus)
    with open(outputPhraseDepFile, 'w') as writer:
        writer.writelines(newDepinfo)
def preprocessEnglishCorpus(CorpusFile, preprocessedFile,
                            phraseListFile='../data/phrase.lst',
                            buffsize=250000000):
    """
    func: preprocess corpus
    params: CorpusFile: corpus file path
    params: preprocessedFile: the output preprocessed file; any existing
        content is overwritten
    params: phraseListFile: phrase list passed to
        buildBilingualDict.phraseMapping; the default keeps the previously
        hard-coded path, which is relative to the current working directory
    params: buffsize: size hint handed to readlines() for each chunk, so the
        corpus is processed in bounded pieces instead of being loaded at once
    return: n/a
    """
    # Single-argument print(...) produces the same output under Python 2's
    # print statement and Python 3's print function.
    print('preprocessing...')

    # Context managers close both files even if a filter or a write raises.
    # The output is opened (and truncated) once rather than being reopened
    # in append mode for every buffer.
    with open(CorpusFile, 'r') as reader, open(preprocessedFile, 'w') as writer:
        phrasemap = buildBilingualDict.phraseMapping(phraseListFile)
        buffcount = 0
        while True:
            lines = reader.readlines(buffsize)
            if not lines:
                break
            buffcount += 1
            print('building with ' + str(buffcount) + ' buffer.....')

            outputbuffer = []
            for line in lines:
                newwords = []
                for word in line.split():
                    # english_word_filter returns an iterable of tokens for
                    # each input word; all of them are kept, in order.
                    newwords.extend(english_word_filter(word))
                newwords = english_phrase_filter(newwords, phrasemap)
                outputbuffer.append(' '.join(newwords) + '\n')

            print('writing buffer...')
            writer.writelines(outputbuffer)