def main(type_atc, argv):
    """Build a thesaurus from second-order syntactic relations.

    Reads run configuration via Parameters, builds the SVD-reduced relation
    matrix from pre-extracted syntactic relations in the temp folder, and
    writes a run log.  The context-extraction, similarity, and
    thesaurus-writing stages were disabled (commented out) in the original;
    only the matrix build remains active.

    type_atc -- thesaurus type tag used in output file names
    argv     -- raw command-line arguments forwarded to Parameters
    """
    # NOTE(review): this definition is duplicated verbatim later in the file
    # and will be shadowed by the subsequent `def main` — confirm which
    # variant is intended to run.
    date_start = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Read all configuration up front, then drop the Parameters object.
    parameters = Parameters(type_atc, argv)
    contexts = parameters.getContexts()  # unused while context stage is disabled
    svd_dimension = int(parameters.getSvdDimension())
    input_folder = parameters.getInputFolder()
    language = parameters.getLanguage()
    min_word_size = parameters.getMinWordSize()
    max_qty_terms = int(parameters.getMaxQtyTerms())
    output_folder = parameters.getOutputFolder()
    temp_folder = parameters.getTempFolder()
    record_log = parameters.getRecordLog()
    record_intermediate = parameters.getRecordIntermediate()
    seeds_file = parameters.getSeedsFile()
    stoplist_file = parameters.getStoplistFile()
    sim_measure = parameters.getSimilarityMeasure()
    del parameters

    logfile = LogFile(record_log, str(date_start), svd_dimension,
                      input_folder, language, stoplist_file, min_word_size,
                      max_qty_terms, None, output_folder, None, temp_folder,
                      seeds_file, sim_measure)

    # NOTE(review): the Contexts / StanfordSyntacticContexts extraction and
    # the Similarities / Thesaurus output stages were commented out in the
    # original source (removed here as dead code).  As written, this run
    # only builds the relation matrix and never writes a thesaurus, despite
    # the success message below — confirm whether the pipeline is
    # intentionally truncated.
    matrix_relation = Matrix(temp_folder, svd_dimension, record_intermediate)
    del matrix_relation

    date_end = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logfile.writeLogfile('- Thesaurus sucessfully built!\nEnding process at: '
                         + str(date_end) + '.\n')
    del logfile
def main(type_atc, argv):
    """Build a thesaurus from second-order syntactic relations.

    Reads run configuration via Parameters, builds the SVD-reduced relation
    matrix from pre-extracted syntactic relations in the temp folder, and
    writes a run log.  The context-extraction, similarity, and
    thesaurus-writing stages were disabled (commented out) in the original;
    only the matrix build remains active.

    type_atc -- thesaurus type tag used in output file names
    argv     -- raw command-line arguments forwarded to Parameters
    """
    # NOTE(review): this is a byte-identical duplicate of the previous
    # `def main` in this file; one of the two should be removed once the
    # intended variant is confirmed.
    date_start = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Read all configuration up front, then drop the Parameters object.
    parameters = Parameters(type_atc, argv)
    contexts = parameters.getContexts()  # unused while context stage is disabled
    svd_dimension = int(parameters.getSvdDimension())
    input_folder = parameters.getInputFolder()
    language = parameters.getLanguage()
    min_word_size = parameters.getMinWordSize()
    max_qty_terms = int(parameters.getMaxQtyTerms())
    output_folder = parameters.getOutputFolder()
    temp_folder = parameters.getTempFolder()
    record_log = parameters.getRecordLog()
    record_intermediate = parameters.getRecordIntermediate()
    seeds_file = parameters.getSeedsFile()
    stoplist_file = parameters.getStoplistFile()
    sim_measure = parameters.getSimilarityMeasure()
    del parameters

    logfile = LogFile(record_log, str(date_start), svd_dimension,
                      input_folder, language, stoplist_file, min_word_size,
                      max_qty_terms, None, output_folder, None, temp_folder,
                      seeds_file, sim_measure)

    # NOTE(review): the Contexts / StanfordSyntacticContexts extraction and
    # the Similarities / Thesaurus output stages were commented out in the
    # original source (removed here as dead code).  As written, this run
    # only builds the relation matrix and never writes a thesaurus, despite
    # the success message below — confirm whether the pipeline is
    # intentionally truncated.
    matrix_relation = Matrix(temp_folder, svd_dimension, record_intermediate)
    del matrix_relation

    date_end = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logfile.writeLogfile('- Thesaurus sucessfully built!\nEnding process at: '
                         + str(date_end) + '.\n')
    del logfile
def main(type_atc, argv):
    """Build a thesaurus from statistical (windowed co-occurrence) relations.

    Pipeline: build a statistical corpus from the input folder (unless
    pre-built contexts are supplied), extract bigrams within a window using
    count.pl from the NGram Statistics Package (NSP), score candidate terms
    against the seed list with the chosen similarity measure, and write the
    top-N terms per seed to an XML thesaurus.

    type_atc -- thesaurus type tag used in output file names
    argv     -- raw command-line arguments forwarded to Parameters
    """
    date_start = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Read all configuration up front, then drop the Parameters object.
    parameters = Parameters(type_atc, argv)
    contexts = parameters.getContexts()
    input_folder = parameters.getInputFolder()
    language = parameters.getLanguage()
    min_word_size = int(parameters.getMinWordSize())
    max_qty_terms = int(parameters.getMaxQtyTerms())
    mi_precision = parameters.getMIPrecision()
    output_folder = parameters.getOutputFolder()
    window_size = parameters.getWindowSize()  # presumably a string; used in filenames — TODO confirm
    temp_folder = parameters.getTempFolder()
    record_log = parameters.getRecordLog()
    record_intermediate = parameters.getRecordIntermediate()
    seeds_file = parameters.getSeedsFile()
    sim_measure = parameters.getSimilarityMeasure()
    del parameters

    logfile = LogFile(record_log, str(date_start), None, input_folder,
                      language, None, min_word_size, max_qty_terms,
                      mi_precision, output_folder, window_size, temp_folder,
                      seeds_file, sim_measure)

    # Shared file names used by several stages below.
    corpus_file = 'W' + window_size + '_Statistical_corpus.txt'
    relations_file = temp_folder + 'W' + window_size + '_Relations.txt'

    stat_corpus = StatisticalCorpus(input_folder, temp_folder,
                                    min_word_size, window_size)
    if not contexts:
        logfile.writeLogfile('- Building statistical corpus at ' + temp_folder)
        if language == 'pt':
            stat_corpus.buildCorpus_pt()
            # Portuguese needs a custom NSP tokenizer definition.
            param_nsp = '--token ../misc/tokens_nsp.pl'
        elif language == 'en':
            stat_corpus.buildCorpus_en()
            param_nsp = ''
        else:
            # BUG FIX: param_nsp was previously unbound for any other
            # language value, crashing below with UnboundLocalError.
            raise ValueError('Unsupported language: ' + repr(language))

        # Use count.pl from the NGram Statistics Package (NSP) to extract
        # bigrams within the configured window.
        logfile.writeLogfile('- Getting bigrams to ' + corpus_file)
        # NOTE(review): os.system builds a shell command from configuration
        # values; paths containing spaces or shell metacharacters will break
        # or be unsafe.  Consider subprocess.call with an argument list.
        command = ('count.pl --ngram 2 ' + param_nsp + ' --window '
                   + window_size + ' ' + temp_folder + corpus_file + ' '
                   + temp_folder + 'Statistical_corpus.txt')
        os.system(command)

        logfile.writeLogfile('- Using ' + sim_measure + ' as similarity measure')
        if sim_measure == 'mutual_information':
            mi = MutualInformation(temp_folder, corpus_file, seeds_file,
                                   mi_precision)
            dic_terms = mi.getDicMI()
            del mi
        else:
            stat_corpus.buildSTRelations(corpus_file, seeds_file)
            measures = Measures(relations_file, seeds_file)
            dic_terms = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
            del measures
    else:
        # Pre-built contexts: score the existing relations file directly.
        measures = Measures(relations_file, seeds_file)
        dic_terms = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
        del measures
    del stat_corpus

    thesaurus_path = (output_folder + 'T' + window_size + '_' + type_atc
                      + '_' + sim_measure + '.xml')
    logfile.writeLogfile('- Building thesaurus in ' + thesaurus_path)
    thesaurus = Thesaurus(thesaurus_path, max_qty_terms)
    thesaurus.write(dic_terms)
    del thesaurus

    date_end = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logfile.writeLogfile('- Thesaurus sucessfully built!\nEnding process at: '
                         + str(date_end) + '.\n')
    del logfile
def main(type_atc, argv):
    """Build a thesaurus from second-order syntactic relations.

    Pipeline: extract syntactic relations (AN, SV, VO) from the input corpus
    with the Stanford parser (or reuse pre-built contexts in the temp
    folder), merge the three relation files, score candidate terms against
    the seed list with the chosen similarity measure, and write the top-N
    terms per seed to an XML thesaurus.

    type_atc -- thesaurus type tag used in output file names
    argv     -- raw command-line arguments forwarded to Parameters
    """
    date_start = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Read all configuration up front, then drop the Parameters object.
    parameters = Parameters(type_atc, argv)
    contexts = parameters.getContexts()
    input_folder = parameters.getInputFolder()
    language = parameters.getLanguage()
    min_word_size = parameters.getMinWordSize()
    max_qty_terms = int(parameters.getMaxQtyTerms())
    output_folder = parameters.getOutputFolder()
    temp_folder = parameters.getTempFolder()
    record_log = parameters.getRecordLog()
    record_intermediate = parameters.getRecordIntermediate()
    seeds_file = parameters.getSeedsFile()
    stoplist_file = parameters.getStoplistFile()
    sim_measure = parameters.getSimilarityMeasure()
    del parameters

    logfile = LogFile(record_log, str(date_start), None, input_folder,
                      language, stoplist_file, min_word_size, max_qty_terms,
                      None, output_folder, None, temp_folder, seeds_file,
                      sim_measure)

    if contexts:
        logfile.writeLogfile('- Building syntactics relations from ' + temp_folder)
        contexts = Contexts(temp_folder)
        del contexts
    else:
        logfile.writeLogfile('- Building syntactics relations from ' + input_folder)
        ling_corpus = StanfordSyntacticContexts(input_folder, temp_folder,
                                                stoplist_file, min_word_size,
                                                record_intermediate)
        del ling_corpus

    # Merge the three per-relation files into one.  BUG FIX: previously done
    # via os.system('cat ...'), which is non-portable and silently ignored
    # failures (e.g. a missing relation file); a missing input now raises
    # IOError instead of producing a silently incomplete merge.
    merged_path = temp_folder + 'Relations2ndOrder.txt'
    logfile.writeLogfile('- Merging terms to ' + merged_path)
    with open(merged_path, 'w') as merged:
        for relation in ('AN', 'SV', 'VO'):
            with open(temp_folder + relation + '_Relations.txt') as part:
                merged.write(part.read())

    logfile.writeLogfile('- Calculating similarity using ' + sim_measure)
    measures = Measures(merged_path, seeds_file)
    dic_topn = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
    del measures

    thesaurus_path = output_folder + 'T_' + type_atc + '_' + sim_measure + '.xml'
    logfile.writeLogfile('- Building thesaurus in ' + thesaurus_path)
    thesaurus = Thesaurus(thesaurus_path, max_qty_terms)
    thesaurus.write(dic_topn)
    del thesaurus

    date_end = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logfile.writeLogfile('- Thesaurus sucessfully built!\nEnding process at: '
                         + str(date_end) + '.\n')
    del logfile
def main(type_atc, argv):
    """Build a thesaurus from statistical (windowed co-occurrence) relations.

    Pipeline: build a statistical corpus from the input folder (unless
    pre-built contexts are supplied), extract bigrams within a window using
    count.pl from the NGram Statistics Package (NSP), score candidate terms
    against the seed list with the chosen similarity measure, and write the
    top-N terms per seed to an XML thesaurus.

    type_atc -- thesaurus type tag used in output file names
    argv     -- raw command-line arguments forwarded to Parameters
    """
    # NOTE(review): this duplicates the earlier statistical `def main` in
    # this file; one of the two should be removed once the intended variant
    # is confirmed.
    date_start = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Read all configuration up front, then drop the Parameters object.
    parameters = Parameters(type_atc, argv)
    contexts = parameters.getContexts()
    input_folder = parameters.getInputFolder()
    language = parameters.getLanguage()
    min_word_size = int(parameters.getMinWordSize())
    max_qty_terms = int(parameters.getMaxQtyTerms())
    mi_precision = parameters.getMIPrecision()
    output_folder = parameters.getOutputFolder()
    window_size = parameters.getWindowSize()  # presumably a string; used in filenames — TODO confirm
    temp_folder = parameters.getTempFolder()
    record_log = parameters.getRecordLog()
    record_intermediate = parameters.getRecordIntermediate()
    seeds_file = parameters.getSeedsFile()
    sim_measure = parameters.getSimilarityMeasure()
    del parameters

    logfile = LogFile(record_log, str(date_start), None, input_folder,
                      language, None, min_word_size, max_qty_terms,
                      mi_precision, output_folder, window_size, temp_folder,
                      seeds_file, sim_measure)

    # Shared file names used by several stages below.
    corpus_file = 'W' + window_size + '_Statistical_corpus.txt'
    relations_file = temp_folder + 'W' + window_size + '_Relations.txt'

    stat_corpus = StatisticalCorpus(input_folder, temp_folder,
                                    min_word_size, window_size)
    if not contexts:
        logfile.writeLogfile('- Building statistical corpus at ' + temp_folder)
        if language == 'pt':
            stat_corpus.buildCorpus_pt()
            # Portuguese needs a custom NSP tokenizer definition.
            param_nsp = '--token ../misc/tokens_nsp.pl'
        elif language == 'en':
            stat_corpus.buildCorpus_en()
            param_nsp = ''
        else:
            # BUG FIX: param_nsp was previously unbound for any other
            # language value, crashing below with UnboundLocalError.
            raise ValueError('Unsupported language: ' + repr(language))

        # Use count.pl from the NGram Statistics Package (NSP) to extract
        # bigrams within the configured window.
        logfile.writeLogfile('- Getting bigrams to ' + corpus_file)
        # NOTE(review): os.system builds a shell command from configuration
        # values; paths containing spaces or shell metacharacters will break
        # or be unsafe.  Consider subprocess.call with an argument list.
        command = ('count.pl --ngram 2 ' + param_nsp + ' --window '
                   + window_size + ' ' + temp_folder + corpus_file + ' '
                   + temp_folder + 'Statistical_corpus.txt')
        os.system(command)

        logfile.writeLogfile('- Using ' + sim_measure + ' as similarity measure')
        if sim_measure == 'mutual_information':
            mi = MutualInformation(temp_folder, corpus_file, seeds_file,
                                   mi_precision)
            dic_terms = mi.getDicMI()
            del mi
        else:
            stat_corpus.buildSTRelations(corpus_file, seeds_file)
            measures = Measures(relations_file, seeds_file)
            dic_terms = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
            del measures
    else:
        # Pre-built contexts: score the existing relations file directly.
        measures = Measures(relations_file, seeds_file)
        dic_terms = measures.getTopNToAllSeeds(sim_measure, max_qty_terms)
        del measures
    del stat_corpus

    thesaurus_path = (output_folder + 'T' + window_size + '_' + type_atc
                      + '_' + sim_measure + '.xml')
    logfile.writeLogfile('- Building thesaurus in ' + thesaurus_path)
    thesaurus = Thesaurus(thesaurus_path, max_qty_terms)
    thesaurus.write(dic_terms)
    del thesaurus

    date_end = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logfile.writeLogfile('- Thesaurus sucessfully built!\nEnding process at: '
                         + str(date_end) + '.\n')
    del logfile