def createEngine(engineOptions):
    # Load the pickled index and build the engine matching the requested type.
    picklePath = Path(engineOptions["index-file"])
    index = deserialize(os.path.join(str(picklePath), "PICKLE.PKL"))
    if engineOptions["type"] == "boolean":
        return BooleanEngine(index)
    elif engineOptions["type"] == "tf-idf":
        return TfIdfEngine(index, engineOptions["n"])
    elif engineOptions["type"] == "bm25":
        return Bm25Engine(index, engineOptions["n"], engineOptions["k"],
                          engineOptions["b"])
    else:
        raise ValueError("Unknown engine type: " + str(engineOptions["type"]))
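# Hedged usage sketch for createEngine(). The option values below are
# hypothetical; the key names mirror the ones the factory reads above, and
# "out/index" stands in for a directory that already contains PICKLE.PKL.
def _demoCreateEngine():
    options = {
        "index-file": "out/index",  # directory holding the pickled index
        "type": "bm25",             # "boolean", "tf-idf", or "bm25"
        "n": 10,                    # result count (assumed meaning)
        "k": 1.2,                   # BM25 k1 parameter (assumed value)
        "b": 0.75,                  # BM25 b parameter (assumed value)
    }
    return createEngine(options)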
def synonymWords(self):
    # Build WordNet synonym lists for every indexed word: a full version and
    # a reduced version filtered against the corpus vocabulary.
    uniqueWordsPath = Path(os.path.join(self.tcLocation, "TC-words.txt"))
    uniqueWordsList = loadStopWords(uniqueWordsPath)
    picklePath = Path(self.indexLocation)
    index = deserialize(os.path.join(picklePath, "PICKLE.PKL"))
    wordSynonymFull = []
    wordSynonymReduced = []
    for word, info in index.indexDict.items():
        identifier = info['id']
        # One lemma name per synset, deduplicated.
        uniqueSynonyms = {synset.lemmas()[0].name()
                          for synset in wordnet.synsets(word)}
        # Keep only the synonyms that occur in the corpus vocabulary.
        filteredSynonyms = {w for w in uniqueSynonyms if w in uniqueWordsList}
        wordSynonymFull.append({'id': identifier,
                                'synonyms': list(uniqueSynonyms)})
        wordSynonymReduced.append({'id': identifier,
                                   'synonyms': list(filteredSynonyms)})
    synonymJsonPath = Path(self.synonymLocation)
    if not os.path.exists(synonymJsonPath):
        os.makedirs(synonymJsonPath)
    with open(os.path.join(synonymJsonPath,
                           "word-synonyms-wordnet-full.json"), 'w') as outfile:
        outfile.write(json.dumps(wordSynonymFull, indent=4))
    with open(os.path.join(synonymJsonPath,
                           "word-synonyms-wordnet-reduced.json"), 'w') as outfile:
        outfile.write(json.dumps(wordSynonymReduced, indent=4))
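# Hedged sketch of the WordNet lookup performed above, isolated for a single
# word. Assumes nltk is installed and the wordnet corpus has been downloaded
# (nltk.download("wordnet")); "car" is just an illustrative input.
def _demoWordnetSynonyms(word="car"):
    from nltk.corpus import wordnet
    # One lemma name per synset, deduplicated, mirroring synonymWords().
    return {synset.lemmas()[0].name() for synset in wordnet.synsets(word)}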
def deserializeResponse(self, response):
    # Variable Definition
    response = Utilities.deserialize(response)  # Deserialize response

    # Code Section
    # Set print color according to response status
    color = PrintColors.OKGREEN if response["status"] else PrintColors.FAIL
    Utilities.logger(color + response["message"])  # Print response
    Utilities.logger(PrintColors.RESET)
    return response["status"]
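# Hedged sketch of the payload shape deserializeResponse() expects after
# Utilities.deserialize(): a mapping with 'status' and 'message' keys.
# The values are illustrative only.
_exampleResponse = {"status": True, "message": "Operation completed"}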
def synonymFullPhrases(self):
    # Expand every indexed phrase into all synonym combinations using the
    # full (unfiltered) WordNet word synonyms.
    jsonPhrasePath = Path(self.phraseLocation)
    jsonPhrase = jsonExtractor(
        os.path.join(jsonPhrasePath, "PHRASE-INDEX.json"))
    synonymsFullJsonPath = Path(self.synonymLocation)
    synonymsFullJson = jsonExtractor(
        os.path.join(synonymsFullJsonPath,
                     "word-synonyms-wordnet-full.json"))
    phraseSynonymsList = []
    for phraseEntry in jsonPhrase:
        phraseAllMeanings = []
        encodedPhrase = phraseEntry['encoded-phrase']
        originalPhrase = phraseEntry['original-phrase']
        dpId = phraseEntry['dp-id']
        for encodedId in encodedPhrase:
            for synonymEntry in synonymsFullJson:
                if synonymEntry['id'] == encodedId:
                    # COMMENT <TAG-002> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                    if len(synonymEntry['synonyms']) == 0:
                        # No synonyms: fall back to the phrase's own tokens.
                        substitute = re.sub(r'[^A-Za-z]', ' ', originalPhrase)
                        phraseAllMeanings.append(substitute.split())
                    else:
                        # COMMENT </TAG-002> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                        phraseAllMeanings.append(synonymEntry['synonyms'])
        phraseSynonymsList.append({
            'dp-id': dpId,
            'synonyms': list(itertools.product(*phraseAllMeanings)),
        })
    synonymJsonPath = Path(self.synonymLocation)
    if not os.path.exists(synonymJsonPath):
        os.makedirs(synonymJsonPath)
    with open(os.path.join(synonymJsonPath,
                           "phrase-synonyms-wordnet-full.json"), 'w') as outfile:
        outfile.write(json.dumps(phraseSynonymsList, indent=4))
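# Hedged sketch of how itertools.product expands the per-token synonym lists
# into candidate phrase variants, as done for 'synonyms' above. Toy data only.
def _demoPhraseVariants():
    perTokenSynonyms = [["fast", "quick"], ["car", "auto"]]
    return list(itertools.product(*perTokenSynonyms))
    # -> [('fast', 'car'), ('fast', 'auto'), ('quick', 'car'), ('quick', 'auto')]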
def phraseIndex(self):
    # Encode every data-point name as a list of word ids from the index and
    # write the result to PHRASE-INDEX.json.
    picklePath = Path(self.indexLocation)
    index = deserialize(os.path.join(picklePath, "PICKLE.PKL"))
    entryIdentifier = 0
    phraseList = []
    docsPath = Path(self.docsLocation)
    dpDocs = jsonExtractor(docsPath)
    for dataPoint in dpDocs:
        dataPointName = dataPoint['name']
        tokenizedEntry = word_tokenize(dataPointName)
        encodedList = []
        for term in tokenizedEntry:
            for word, info in index.indexDict.items():
                if word.lower() == term.lower():
                    encodedList.append(info['id'])
                    break
        phraseList.append({
            'dp-id': entryIdentifier,
            'original-phrase': dataPointName,
            'encoded-phrase': encodedList,
        })
        entryIdentifier += 1
    phraseJsonPath = Path(self.indexJson)
    if not os.path.exists(phraseJsonPath):
        os.makedirs(phraseJsonPath)
    with open(os.path.join(phraseJsonPath, "PHRASE-INDEX.json"),
              'w') as outfile:
        outfile.write(json.dumps(phraseList, indent=4))
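# Hedged sketch of the token -> id encoding above, using a plain dict in
# place of the pickled index. The vocabulary and its ids are hypothetical;
# word_tokenize assumes nltk's punkt tokenizer data is available.
def _demoEncodePhrase(phrase="Binary Search Tree"):
    vocabulary = {"binary": 0, "search": 1, "tree": 2}
    return [vocabulary[t.lower()] for t in word_tokenize(phrase)
            if t.lower() in vocabulary]
    # -> [0, 1, 2]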
def synonymReducedPhrases(self):
    # Like synonymFullPhrases(), but only for phrases sharing at least one
    # token with the corpus bi/tri-gram phrases, using the reduced synonyms.
    corpusPhrasesPath = Path(
        os.path.join(self.tcLocation, "TC-phrases-bi-tri.txt"))
    corpusPhrasesList = loadStopWords(corpusPhrasesPath)
    corpusPhrasesListCleaned = [word.replace("_", " ").lower()
                                for word in corpusPhrasesList]
    jsonPhrasePath = Path(self.phraseLocation)
    jsonPhrase = jsonExtractor(
        os.path.join(jsonPhrasePath, "PHRASE-INDEX.json"))
    synonymsReducedJsonPath = Path(self.synonymLocation)
    synonymsReducedJson = jsonExtractor(
        os.path.join(synonymsReducedJsonPath,
                     "word-synonyms-wordnet-reduced.json"))
    phraseSynonymsList = []
    for phraseEntry in jsonPhrase:
        phraseAllMeanings = []
        encodedPhrase = phraseEntry['encoded-phrase']
        originalPhrase = phraseEntry['original-phrase']
        # Skip phrases that share no token with the corpus phrases.
        tokenInCorpusFlag = False
        for token in originalPhrase.split():
            for phrase in corpusPhrasesListCleaned:
                if token.lower() in phrase.split():
                    tokenInCorpusFlag = True
                    break
            if tokenInCorpusFlag:
                break
        if not tokenInCorpusFlag:
            continue
        dpId = phraseEntry['dp-id']
        for encodedId in encodedPhrase:
            for synonymEntry in synonymsReducedJson:
                if synonymEntry['id'] == encodedId:
                    # COMMENT <TAG-001> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                    if len(synonymEntry['synonyms']) == 0:
                        # No synonyms: fall back to the phrase's own tokens.
                        substitute = re.sub(r'[^A-Za-z]', ' ', originalPhrase)
                        phraseAllMeanings.append(substitute.split())
                    else:
                        # COMMENT </TAG-001> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                        phraseAllMeanings.append(synonymEntry['synonyms'])
        phraseSynonymsList.append({
            'dp-id': dpId,
            'synonyms': list(itertools.product(*phraseAllMeanings)),
        })
    synonymJsonPath = Path(self.synonymLocation)
    if not os.path.exists(synonymJsonPath):
        os.makedirs(synonymJsonPath)
    with open(os.path.join(synonymJsonPath,
                           "phrase-synonyms-wordnet-reduced.json"),
              'w') as outfile:
        outfile.write(json.dumps(phraseSynonymsList, indent=4))
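# Hedged sketch of the corpus-membership filter above: a phrase is processed
# only if at least one of its tokens occurs in some corpus phrase. The sample
# phrases are hypothetical.
def _demoTokenInCorpus(phrase="red black tree",
                       corpusPhrases=("binary tree", "hash table")):
    corpusTokens = {t for p in corpusPhrases for t in p.split()}
    return any(token.lower() in corpusTokens for token in phrase.split())
    # -> True ("tree" appears in "binary tree")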
if TO_SERIALIZE:
    raw_data = read_data(TRAIN_DIR, nfiles=c_nfiles)
    # Build the dataset and the dictionaries from the raw data
    data, dictionary, reverse_dictionary = build_dataset(
        raw_data, VOCABULARY_SIZE)
    del raw_data  # To reduce memory.
    # To avoid reading the whole dataset again and again
    print("Serializing the data.")
    serialize(DATA_DICT_SERIALIZATION, (data, dictionary, reverse_dictionary))
else:
    print("Reading serialization: ")
    data, dictionary, reverse_dictionary = deserialize(DATA_DICT_SERIALIZATION)

# Stores some information about the current test configuration
configuration = 'BATCH_SIZE: ' + str(BATCH_SIZE) + ' EMBEDDING_SIZE: ' + str(EMBEDDING_SIZE) + ' WINDOW_SIZE: ' \
    + str(WINDOW_SIZE) + ' VOCABULARY_SIZE ' + str(VOCABULARY_SIZE) + ' nfiles: ' + str(c_nfiles) + \
    ' stopwd: ' + str(c_stopwd) + ' shuffle_docs: ' + str(c_shuffle_docs) + ' dataset_size ' + str(len(data))
print("CONFIG: " + configuration)

# Read the question file for the Analogical Reasoning evaluation
questions = read_analogies(ANALOGIES_FILE, dictionary)

# ------------------------------------------ MODEL DEFINITION --------------------------------------------------------
graph = tf.Graph()
evall = None
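# Hedged sketch of plausible pickle-based serialize()/deserialize() helpers
# matching the calls in the TO_SERIALIZE branch above; the project's actual
# implementations may differ. The underscore names mark these as illustrative.
import pickle

def _serializeSketch(path, obj):
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def _deserializeSketch(path):
    with open(path, "rb") as f:
        return pickle.load(f)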