    def synonymFullPhrases(self):
        picklePath = Path(self.indexLocation)
        index = deserialize(os.path.join(picklePath, "PICKLE.PKL"))

        jsonPhrasePath = Path(self.phraseLocation)
        jsonPhrase = jsonExtractor(
            os.path.join(jsonPhrasePath, "PHRASE-INDEX.json"))

        synonymsFullJsonPath = Path(self.synonymLocation)
        synonymsFullJson = jsonExtractor(
            os.path.join(synonymsFullJsonPath,
                         "word-synonyms-wordnet-full.json"))

        phraseSynonymEntry = {}
        phraseSynonymsList = []

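        # For each indexed phrase, build one synonym list per encoded token;
        # tokens whose index entry has no synonyms fall back to the alphabetic
        # tokens of the original phrase (see TAG-002 below)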
        for phraseEntry in jsonPhrase:

            phraseAllMeanings = []
            encodedPhrase = phraseEntry['encoded-phrase']
            originalPhrase = phraseEntry['original-phrase']
            dpId = phraseEntry['dp-id']
            for encodedId in encodedPhrase:
                for synonymEntry in synonymsFullJson:
                    if synonymEntry['id'] == encodedId:
                        # COMMENT <TAG-002> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                        if len(synonymEntry['synonyms']) == 0:
                            reg = re.compile(r'[^A-Za-z]')
                            substitute = re.sub(reg, ' ', originalPhrase)
                            tokenizedPhraseInput = substitute.split()
                            phraseAllMeanings.append(tokenizedPhraseInput)
                        else:
                            # COMMENT </TAG-002> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                            phraseAllMeanings.append(synonymEntry['synonyms'])

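            # itertools.product enumerates every combination of one synonym per token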
            phraseSynonymEntry['dp-id'] = dpId
            phraseSynonymEntry['synonyms'] = list(
                itertools.product(*phraseAllMeanings))
            phraseSynonymsList.append(dict(phraseSynonymEntry))

        synonymJsonPath = Path(self.synonymLocation)
        if not os.path.exists(synonymJsonPath):
            os.makedirs(synonymJsonPath)
        phrasesFullSynonyms = json.dumps(phraseSynonymsList, indent=4)

        with open(
                os.path.join(synonymJsonPath,
                             "phrase-synonyms-wordnet-full.json"),
                'w') as outfile:
            outfile.write(phrasesFullSynonyms)
Example 2
def searchWithoutSynonyms(config):
    analyzer = createAnalyzer(config["analyzer-options"])
    engine = createEngine(config["engine-options"])

    while True:
        matchList = runInputSearchPrintLoopOld(engine, analyzer, config)
        resultList = []
        jsonTextDpDocs = jsonExtractor(config["docs-file"])

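        # Resolve each match id against the docs file (linear scan per match)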
        for match in matchList["matches"]:
            for obj in jsonTextDpDocs:
                if match[0] == obj.get("id"):
                    description = obj.get("description")
                    definition = obj.get("definition")
                    name = obj.get("name")
                    resultList.append(
                        Result(match[0], name, description, definition))

        for obj in resultList:
            print("\nDOC = ", obj.getName(), "\nVALUE = ", obj.getIdValue())
            print("DESCRIPTION = ", obj.getDescription())
            print("DEFINITION = ", obj.getDefinition())
        print("\nTotal Results returned: " + str(len(resultList)))

        outputPath = Path(config["engine-options"]["output-path"])
        if not os.path.exists(str(outputPath)):
            os.makedirs(str(outputPath))

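        # Overwrites OUTPUT.TXT on every query; only the latest results are kept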
        with open(os.path.join(str(outputPath), "OUTPUT.TXT"), 'w') as outfile:
            header = str(config["engine-options"]["type"]) + " " + str(
                config["analyzer-options"]["type"]) + " RESULTS for " + str(
                    matchList["input"]) + ":"
            outfile.write(str(header))
            outfile.write("\nTotal Results returned: " + str(len(resultList)))
            for obj in resultList:
                doc = obj.getName()
                value = obj.getIdValue()
                descriptionValue = obj.getDescription()
                definitionValue = obj.getDefinition()
                outfile.write("\n\nDOC: " + str(doc))
                outfile.write("\nValue: " + str(value))
                outfile.write("\nDescription: " + str(descriptionValue))
                outfile.write("\nDefinition: " + str(definitionValue))
Example 3
    def synGenerator(self):
        model = Word2Vec.load(self.modelFile)
        index = jsonExtractor(os.path.join(self.indexFile, "INDEX.JSON"))
        listOfValues = list(index.keys())
        synSets = {}
        # Look up the 10 nearest neighbours of every indexed word in the model
        for word in sorted(listOfValues):
            try:
                if word not in synSets:
                    synSets[word] = model.most_similar(word, topn=10)
            except KeyError:
                # Word is missing from the Word2Vec vocabulary
                continue
        synSetsJson = json.dumps(synSets, indent=4)
        if not os.path.exists(self.outputFile):
            os.mkdir(self.outputFile)
        with open(os.path.join(self.outputFile, 'w2vSynonyms.json'),
                  'w') as outfile:
            outfile.write(synSetsJson)
    def textBreakdown(self):
        #analyzer = createAnalyzer(self.analyzerParam)
        analyzer = BaseAnalyzer()
        fileCounter = 0
        dataPaths = jsonExtractor(self.DataFile)
        # Re-process only files whose corpus-evaluation score is below the threshold
        for path, score in dataPaths.items():
            if score < 0.1374:
                filepath = path
                print(filepath)
                with open(filepath) as inFile:
                    fileCounter += 1
                    for rawLine in inFile:
                        processedLines = sent_tokenize(rawLine)
                        for line in processedLines:
                            lineAnalyzed = analyzer.analyze("DUMMY", line)
                            lineAfterAnalyzer = ""
                            for finalToken in lineAnalyzed:
                                lineAfterAnalyzer += " " + finalToken.getForm()
                            if len(lineAfterAnalyzer) == 0:
                                continue
                            file = Path(filepath).parts[-1]
                            directory = Path(filepath).parts[-2]
                            outDir = os.path.join(self.corpusDestination, directory)
                            if not os.path.exists(outDir):
                                os.makedirs(outDir)
                            # str.replace returns a new string, so the original call
                            # discarded its result; strip the trailing newline and
                            # append exactly one instead
                            with open(os.path.join(outDir, file), "a") as outFile:
                                outFile.write(lineAfterAnalyzer.rstrip("\n") + "\n")
        print("\nTotal TXT processed = ", fileCounter)
    def phraseIndex(self):
        picklePath = Path(self.indexLocation)
        index = deserialize(os.path.join(picklePath, "PICKLE.PKL"))

        entryIdentifier = 0
        phraseEntry = {}
        phraseList = []

        docsPath = Path(self.docsLocation)
        dpDocs = jsonExtractor(docsPath)
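        # Encode every token of the data-point name as its id in the pickled index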
        for dataPoint in dpDocs:
            dataPointName = dataPoint['name']
            tokenizedEntry = word_tokenize(dataPointName)
            encodedList = []
            for term in tokenizedEntry:
                for word, entryData in index.indexDict.items():
                    if word.lower() == term.lower():
                        encodedList.append(entryData['id'])
                        break

            phraseEntry['dp-id'] = entryIdentifier
            phraseEntry['original-phrase'] = dataPointName
            phraseEntry['encoded-phrase'] = encodedList
            entryIdentifier += 1
            phraseList.append(dict(phraseEntry))

        phraseObject = json.dumps(phraseList, indent=4)

        phraseJsonPath = Path(self.indexJson)
        if not os.path.exists(phraseJsonPath):
            os.makedirs(phraseJsonPath)
        with open(os.path.join(phraseJsonPath, "PHRASE-INDEX.json"),
                  'w') as outfile:
            outfile.write(phraseObject)
    def synonymReducedPhrases(self):
        corpusPhrasesListCleaned = []
        picklePath = Path(self.indexLocation)
        index = deserialize(os.path.join(picklePath, "PICKLE.PKL"))

        corpusPhrasesPath = Path(
            os.path.join(self.tcLocation, "TC-phrases-bi-tri.txt"))
        corpusPhrasesList = loadStopWords(corpusPhrasesPath)
        for word in corpusPhrasesList:
            corpusPhrasesListCleaned.append(word.replace("_", " ").lower())

        jsonPhrasePath = Path(self.phraseLocation)
        jsonPhrase = jsonExtractor(
            os.path.join(jsonPhrasePath, "PHRASE-INDEX.json"))

        synonymsFullJsonPath = Path(self.synonymLocation)
        synonymsFullJson = jsonExtractor(
            os.path.join(synonymsFullJsonPath,
                         "word-synonyms-wordnet-reduced.json"))

        phraseSynonymEntry = {}
        phraseSynonymsList = []

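        # Skip phrases that share no token with the bi/tri-gram corpus phrases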
        for phraseEntry in jsonPhrase:

            phraseAllMeanings = []
            encodedPhrase = phraseEntry['encoded-phrase']
            originalPhrase = phraseEntry['original-phrase']
            tokenizedOriginalPhrase = originalPhrase.split()
            tokenInCorpusFlag = False
            for token in tokenizedOriginalPhrase:
                for phrase in corpusPhrasesListCleaned:
                    brokenPhrase = phrase.split()
                    if token.lower() in brokenPhrase:
                        tokenInCorpusFlag = True
                        break
                if tokenInCorpusFlag:
                    break

            if not tokenInCorpusFlag:
                continue

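            # Same scheme as synonymFullPhrases: one synonym list per encoded token,
            # raw tokens as the fallback when an entry has no synonyms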
            dpId = phraseEntry['dp-id']
            for encodedId in encodedPhrase:
                for synonymEntry in synonymsFullJson:
                    if synonymEntry['id'] == encodedId:
                        # COMMENT <TAG-001> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                        if len(synonymEntry['synonyms']) == 0:
                            reg = re.compile(r'[^A-Za-z]')
                            substitute = re.sub(reg, ' ', originalPhrase)
                            tokenizedPhraseInput = substitute.split()
                            phraseAllMeanings.append(tokenizedPhraseInput)
                        else:
                            # COMMENT </TAG-001> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                            phraseAllMeanings.append(synonymEntry['synonyms'])

            phraseSynonymEntry['dp-id'] = dpId
            phraseSynonymEntry['synonyms'] = list(
                itertools.product(*phraseAllMeanings))
            phraseSynonymsList.append(dict(phraseSynonymEntry))

        synonymJsonPath = Path(self.synonymLocation)
        if not os.path.exists(synonymJsonPath):
            os.makedirs(synonymJsonPath)
        phrasesReducedSynonyms = json.dumps(phraseSynonymsList, indent=4)

        with open(
                os.path.join(synonymJsonPath,
                             "phrase-synonyms-wordnet-reduced.json"),
                'w') as outfile:
            outfile.write(phrasesReducedSynonyms)
    w2vSynGenObj.synGenerator()


def synonymGeneration(config, wordsOrPhrasesParam):
    synonymObject = Synonyms(config)

    if wordsOrPhrasesParam == "words":
        synonymObject.synonymWords()
    elif wordsOrPhrasesParam == "phrases":
        synonymObject.synonymFullPhrases()
        synonymObject.synonymReducedPhrases()


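# Typical invocation (assumed; the config schema is whatever jsonExtractor's
# callers expect):
#   python <script>.py Evaluate config.json
#   python <script>.py pre-process config.json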
if __name__ == "__main__":
    doWhat = sys.argv[1]
    config = jsonExtractor(sys.argv[2])
    print("CONFIG = ", config)
    if doWhat == "Evaluate":
        CorpusEvaluationStart = time.time()
        evaluation = evalCorpus(config)
        evaluation.eval()
        CorpusEvaluationEnd = time.time()
        totalTimeForCorpusEvaluation = CorpusEvaluationEnd - CorpusEvaluationStart
        print("Elaspsed Time for Corpus Evaluation = ",
              totalTimeForCorpusEvaluation, "seconds")

    elif doWhat == "pre-process":
        corpusCreationStart = time.time()
        preProcessedCorpusCreation(config)
        corpusCreationEnd = time.time()
        totalTimeForCorpus = corpusCreationEnd - corpusCreationStart