def extractFullMIToThesaurus(): accents = Accents() parameters = Parameters() max_qty_terms = parameters.getMaxQtyTerms() seeds = Seeds() dic_seeds = seeds.getSeeds() mi_file = Statistic(stat_temp+'IMT_FullStatisticalCorpus.txt') try: thesaurus_file = codecs.open('../Data/Output/T3/T3_Jaccard.xml', 'w', 'utf-8') except IOError: print 'ERROR: System cannot open the file ../Data/Output/T3/T3_Jaccard.xml' sys.exit() thesaurus_file.write('<?xml version="1.0" encoding="ISO-8859-1"?>\n<thesaurus>\n\t<ontology id="privacy">\n') for seed in dic_seeds: qty_terms = 0 dic_related = mi_file.getOrderedNounMIForTerm(seed) if dic_related != False: thesaurus_file.write('\t\t<seed term_id="" term_name="'+accents.buildAccents(seed)+'" type="">\n') for mi_related in dic_related: if qty_terms < max_qty_terms: thesaurus_file.write('\t\t\t<term id="" display="ON" similarity="'+mi_related[0]+'">'+accents.buildAccents(mi_related[1])+'</term>\n') qty_terms += 1 thesaurus_file.write('\t\t</seed>\n') thesaurus_file.write('\t</ontology>\n</thesaurus>') thesaurus_file.close()
class Seeds: def __init__(self): self.dic_seeds = {} self.accents = Accents() self.__buildDic__() def __buildDic__(self): try: file_seeds = codecs.open('seeds.txt', 'r', 'utf-8') except IOError: print 'ERROR: System cannot open the seeds.txt file' sys.exit() for line in file_seeds: if line != '': line = line.replace('\n','') line = self.accents.buildCodes(line) self.dic_seeds[line] = line file_seeds.close() def getQtySeeds(self): return len(self.dic_seeds) def getSeeds(self): return sorted(self.dic_seeds.keys()) def printSeeds(self): print self.dic_seeds def printQtySeeds(self): print len(self.dic_seeds)
def __buildStatisticalCorpus__(self): try: root, dirs, files = os.walk(self.corpus_folder).next()[:3] except: print 'ERROR: It was not possible to open the ../Data/Corpus/Raw/ folder' sys.exit() accents = Accents() for corpus_file in files: if re.match('.*xml$', corpus_file): corpus_filename = corpus_file.split('.')[0] xmlfile = ParseXml(root+''+corpus_file) dic_terms = xmlfile.getDicTerms() dic_nouns = xmlfile.getNouns() dic_verbs = xmlfile.getVerbs() id_sentence = 1 id_word = 1 id_t = 's'+str(id_sentence)+'_'+str(id_word) string_full = '' string_nouns = '' while dic_terms.has_key(id_t): while dic_terms.has_key(id_t): if not re.match('^(pu|num|conj|art|prp|spec)', dic_terms[id_t]['pos']) and (re.search('[$]', dic_terms[id_t]['lemma']) is None) and (len(dic_terms[id_t]['lemma']) >= self.parameters.getMinWordSize()): lemma = accents.buildCodes(dic_terms[id_t]['lemma']) if dic_nouns.has_key(id_t): string_nouns += lemma+'__N ' string_full += lemma+'__N ' elif dic_verbs.has_key(id_t): string_nouns += lemma+'__V ' string_full += lemma+'__V ' else: string_full += lemma+'__O ' string_nouns = string_nouns.replace('-', '_') string_full = string_full.replace('-', '_') id_word += 1 id_t = 's'+str(id_sentence)+'_'+str(id_word) id_word = 1 id_sentence += 1 id_t = 's'+str(id_sentence)+'_'+str(id_word) self.__writeCorpusFile__(corpus_filename, string_full, string_nouns)
def __init__(self):
    """Set up the seed container and load it from disk."""
    # Normalizer used when building dictionary keys.
    self.accents = Accents()
    # Starts empty; __buildDic__ fills it from the seeds file.
    self.dic_seeds = {}
    self.__buildDic__()