def synonymFullPhrases(self):
    """Expand every indexed phrase into all synonym combinations.

    Reads PHRASE-INDEX.json (encoded phrases) and
    word-synonyms-wordnet-full.json (per-word synonym lists), builds the
    cartesian product of per-token synonym lists for each phrase, and writes
    the result to phrase-synonyms-wordnet-full.json under synonymLocation.
    """
    # NOTE(review): the pickle index is loaded but never used below; kept for
    # its fail-fast side effect (a missing index aborts early) — TODO confirm
    # whether it can be dropped.
    index = deserialize(os.path.join(Path(self.indexLocation), "PICKLE.PKL"))

    jsonPhrase = jsonExtractor(
        os.path.join(Path(self.phraseLocation), "PHRASE-INDEX.json"))
    synonymsFullJson = jsonExtractor(
        os.path.join(Path(self.synonymLocation),
                     "word-synonyms-wordnet-full.json"))

    # O(1) lookup by word id instead of rescanning the whole synonym list for
    # every encoded token (assumes ids are unique in the synonym file — they
    # are word-index ids; TODO confirm).
    synonymsById = {entry['id']: entry['synonyms']
                    for entry in synonymsFullJson}

    # Hoisted: was recompiled for every empty-synonym token.
    nonAlpha = re.compile(r'[^A-Za-z]')

    phraseSynonymsList = []
    for phraseEntry in jsonPhrase:
        phraseAllMeanings = []
        originalPhrase = phraseEntry['original-phrase']
        for encodedId in phraseEntry['encoded-phrase']:
            if encodedId not in synonymsById:
                continue  # id absent from synonym file: skip, as before
            synonyms = synonymsById[encodedId]
            # COMMENT <TAG-002> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
            if not synonyms:
                # No synonyms recorded: fall back to the phrase's own
                # alphabetic tokens so the product below is not emptied.
                phraseAllMeanings.append(
                    nonAlpha.sub(' ', originalPhrase).split())
            else:
            # COMMENT </TAG-002> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                phraseAllMeanings.append(synonyms)
        phraseSynonymsList.append({
            'dp-id': phraseEntry['dp-id'],
            'synonyms': list(itertools.product(*phraseAllMeanings)),
        })

    synonymJsonPath = Path(self.synonymLocation)
    os.makedirs(synonymJsonPath, exist_ok=True)  # idempotent dir creation
    with open(os.path.join(synonymJsonPath,
                           "phrase-synonyms-wordnet-full.json"),
              'w') as outfile:
        outfile.write(json.dumps(phraseSynonymsList, indent=4))
def searchWithoutSynonyms(config):
    """Interactive search loop without synonym expansion.

    Repeatedly runs one search round: prompts via runInputSearchPrintLoopOld,
    resolves the matched ids against the docs file, prints each result, and
    rewrites OUTPUT.TXT with the round's results. Loops forever; termination
    (if any) happens inside the input loop.
    """
    analyzer = createAnalyzer(config["analyzer-options"])
    engine = createEngine(config["engine-options"])
    while True:
        matchList = runInputSearchPrintLoopOld(engine, analyzer, config)
        jsonTextDpDocs = jsonExtractor(config["docs-file"])
        resultList = []
        # Resolve every match id against the doc records. Deliberately no
        # break: a duplicated id in the docs file yields one Result each,
        # exactly as the original scan did.
        for match in matchList["matches"]:
            for obj in jsonTextDpDocs:
                if obj.get("id") == match[0]:
                    resultList.append(Result(match[0],
                                             obj.get("name"),
                                             obj.get("description"),
                                             obj.get("definition")))

        for obj in resultList:
            print("\nDOC = ", obj.getName(), "\nVALUE = ", obj.getIdValue())
            print("DESCRIPTION = ", obj.getDescription())
            print("DEFINITION = ", obj.getDefinition())
        print("\nTotal Results returned: " + str(len(resultList)))

        outputPath = Path(config["engine-options"]["output-path"])
        if not os.path.exists(str(outputPath)):
            os.makedirs(os.path.join(str(outputPath)))
        with open(os.path.join(str(outputPath), "OUTPUT.TXT"),
                  'w') as outfile:
            header = str(config["engine-options"]["type"]) + " " + str(
                config["analyzer-options"]["type"]) + " RESULTS for " + str(
                    matchList["input"]) + ":"
            outfile.write(str(header))
            outfile.write("\nTotal Results returned: " + str(len(resultList)))
            for obj in resultList:
                outfile.write("\n\nDOC: " + str(obj.getName()))
                outfile.write("\nValue: " + str(obj.getIdValue()))
                outfile.write("\nDescription: " + str(obj.getDescription()))
                outfile.write("\nDefinition: " + str(obj.getDefinition()))
def synGenerator(self):
    """Generate word2vec synonyms for every indexed word.

    Loads the trained model from modelFile and the word index from
    INDEX.JSON, looks up the 10 nearest neighbours of each word, and writes
    the mapping to w2vSynonyms.json in outputFile.
    """
    model = Word2Vec.load(self.modelFile)
    index = jsonExtractor(os.path.join(self.indexFile, "INDEX.JSON"))

    synSets = {}
    # Iterating the dict yields its keys directly (the original copied them
    # via .items()); keys are unique, so the old membership re-check was
    # redundant. sorted() keeps the output file deterministic.
    for word in sorted(index):
        try:
            synSets[word] = model.most_similar(word, topn=10)
        except KeyError:
            # Word absent from the embedding vocabulary — skip it.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and genuine bugs.)
            continue

    # makedirs instead of mkdir so a nested output path is created too.
    if not os.path.exists(self.outputFile):
        os.makedirs(self.outputFile)
    with open(os.path.join(self.outputFile, 'w2vSynonyms.json'),
              'w') as outfile:
        outfile.write(json.dumps(synSets, indent=4))
def textBreakdown(self):
    """Sentence-tokenize and analyze low-scoring corpus files.

    For every file whose score in DataFile is below 0.1374 (empirical
    threshold — TODO confirm its origin), each line is sentence-split,
    analyzed token by token, and the analyzed sentences are appended to a
    mirror file <corpusDestination>/<parent-dir>/<filename>.
    """
    # analyzer = createAnalyzer(self.analyzerParam)
    analyzer = BaseAnalyzer()
    fileCounter = 0
    dataPaths = jsonExtractor(self.DataFile)
    for filepath, score in dataPaths.items():
        if score >= 0.1374:  # keep only low-scoring files
            continue
        print(filepath)
        # Hoisted out of the per-line loop: constant for the whole file.
        # (Also removes the original shadowing of the loop variable `path`.)
        srcPath = Path(filepath)
        fileName = srcPath.parts[-1]
        directory = srcPath.parts[-2]
        destDir = os.path.join(self.corpusDestination, directory)
        with open(filepath) as inFile:
            fileCounter += 1
            for rawLine in inFile:
                for sentence in sent_tokenize(rawLine):
                    lineAfterAnalyzer = ""
                    for finalToken in analyzer.analyze("DUMMY", sentence):
                        lineAfterAnalyzer += " " + finalToken.getForm()
                    if len(lineAfterAnalyzer) == 0:
                        continue
                    # BUGFIX: the original called str.replace and discarded
                    # the result (strings are immutable), so the newline was
                    # never actually stripped before appending "\n".
                    if lineAfterAnalyzer.endswith("\n"):
                        lineAfterAnalyzer = lineAfterAnalyzer.replace("\n", "")
                    # exist_ok makes this idempotent — the original
                    # duplicated the entire write block in an if/else just
                    # to handle directory creation.
                    os.makedirs(destDir, exist_ok=True)
                    with open(os.path.join(destDir, fileName),
                              "a") as outFile:
                        outFile.write(lineAfterAnalyzer + "\n")
    print("\nTotal TXT processed = ", fileCounter)
def phraseIndex(self):
    """Encode each data-point name as a list of index word ids.

    Tokenizes every data point's name, maps each token to its id in the
    pickled index (case-insensitive), and writes the resulting phrase
    records to PHRASE-INDEX.json under indexJson.
    """
    picklePath = Path(self.indexLocation)
    index = deserialize(os.path.join(picklePath, "PICKLE.PKL"))

    # Case-insensitive lookup built once. The original rescanned the whole
    # index dict for every token (accidental O(terms x vocab)). setdefault
    # keeps the FIRST entry encountered, matching the original scan order.
    idByLowerWord = {}
    for word, info in index.indexDict.items():
        idByLowerWord.setdefault(word.lower(), info['id'])

    dpDocs = jsonExtractor(Path(self.docsLocation))
    phraseList = []
    # enumerate replaces the hand-incremented entryIdentifier counter.
    for entryIdentifier, dataPoint in enumerate(dpDocs):
        dataPointName = dataPoint['name']
        encodedList = [idByLowerWord[term.lower()]
                       for term in word_tokenize(dataPointName)
                       if term.lower() in idByLowerWord]
        phraseList.append({
            'dp-id': entryIdentifier,
            'original-phrase': dataPointName,
            'encoded-phrase': encodedList,
        })

    phraseJsonPath = Path(self.indexJson)
    if not os.path.exists(phraseJsonPath):
        os.makedirs(phraseJsonPath)
    with open(os.path.join(phraseJsonPath, "PHRASE-INDEX.json"),
              'w') as outfile:
        outfile.write(json.dumps(phraseList, indent=4))
def synonymReducedPhrases(self):
    """Expand synonym combinations only for corpus-relevant phrases.

    Like synonymFullPhrases, but a phrase is processed only when at least
    one of its tokens appears in the bi/tri-gram corpus phrase list
    (TC-phrases-bi-tri.txt). Writes phrase-synonyms-wordnet-reduced.json
    under synonymLocation.
    """
    # NOTE(review): the pickle index is loaded but never used below; kept for
    # its fail-fast side effect (a missing index aborts early) — TODO confirm
    # whether it can be dropped.
    index = deserialize(os.path.join(Path(self.indexLocation), "PICKLE.PKL"))

    corpusPhrasesPath = Path(
        os.path.join(self.tcLocation, "TC-phrases-bi-tri.txt"))
    # Flatten the cleaned corpus phrases to one lower-cased token set for
    # O(1) membership tests. Exactly equivalent to the original nested scan:
    # a token matched iff some cleaned phrase's split() contained it.
    corpusTokens = set()
    for word in loadStopWords(corpusPhrasesPath):
        corpusTokens.update(word.replace("_", " ").lower().split())

    jsonPhrase = jsonExtractor(
        os.path.join(Path(self.phraseLocation), "PHRASE-INDEX.json"))
    synonymsFullJson = jsonExtractor(
        os.path.join(Path(self.synonymLocation),
                     "word-synonyms-wordnet-reduced.json"))

    # O(1) lookup by word id instead of rescanning the whole synonym list
    # per encoded token (assumes ids are unique — TODO confirm).
    synonymsById = {entry['id']: entry['synonyms']
                    for entry in synonymsFullJson}

    # Hoisted: was recompiled for every empty-synonym token.
    nonAlpha = re.compile(r'[^A-Za-z]')

    phraseSynonymsList = []
    for phraseEntry in jsonPhrase:
        originalPhrase = phraseEntry['original-phrase']
        # Skip phrases that share no token with the reduced corpus
        # vocabulary (replaces the manual tokenInCorpusFlag bookkeeping).
        if not any(tok.lower() in corpusTokens
                   for tok in originalPhrase.split()):
            continue
        phraseAllMeanings = []
        for encodedId in phraseEntry['encoded-phrase']:
            if encodedId not in synonymsById:
                continue  # id absent from synonym file: skip, as before
            synonyms = synonymsById[encodedId]
            # COMMENT <TAG-001> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
            if not synonyms:
                # No synonyms recorded: fall back to the phrase's own
                # alphabetic tokens so the product below is not emptied.
                phraseAllMeanings.append(
                    nonAlpha.sub(' ', originalPhrase).split())
            else:
            # COMMENT </TAG-001> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                phraseAllMeanings.append(synonyms)
        phraseSynonymsList.append({
            'dp-id': phraseEntry['dp-id'],
            'synonyms': list(itertools.product(*phraseAllMeanings)),
        })

    synonymJsonPath = Path(self.synonymLocation)
    os.makedirs(synonymJsonPath, exist_ok=True)  # idempotent dir creation
    with open(os.path.join(synonymJsonPath,
                           "phrase-synonyms-wordnet-reduced.json"),
              'w') as outfile:
        outfile.write(json.dumps(phraseSynonymsList, indent=4))
w2vSynGenObj.synGenerator() def synonymGeneration(config, wordsOrPhrasesParam): synonymObject = Synonyms(config) if (wordsOrPhrasesParam == "words"): synonymObject.synonymWords() elif (wordsOrPhrasesParam == "phrases"): synonymObject.synonymFullPhrases() synonymObject.synonymReducedPhrases() if __name__ == "__main__": doWhat = sys.argv[1] config = jsonExtractor(sys.argv[2]) print("CONFIG = ", config) if (doWhat == "Evaluate"): CorpusEvaluationStart = time.time() evaluation = evalCorpus(config) evaluation.eval() CorpusEvaluationEnd = time.time() totalTimeForCorpusEvaluation = CorpusEvaluationEnd - CorpusEvaluationStart print("Elaspsed Time for Corpus Evaluation = ", totalTimeForCorpusEvaluation, "seconds") elif (doWhat == "pre-process"): corpusCreationStart = time.time() preProcessedCorpusCreation(config) corpusCreationEnd = time.time() totalTimeForCorpus = corpusCreationEnd - corpusCreationStart