Example #1
def createEngine(engineOptions):
    picklePath = Path(engineOptions["index-file"])
    index = deserialize(os.path.join(picklePath, "PICKLE.PKL"))
    if engineOptions["type"] == "boolean":
        return BooleanEngine(index)
    elif engineOptions["type"] == "tf-idf":
        return TfIdfEngine(index, engineOptions["n"])
    elif engineOptions["type"] == "bm25":
        return Bm25Engine(index, engineOptions["n"], engineOptions["k"], engineOptions["b"])
    else:
        raise ValueError("unknown engine type: " + str(engineOptions["type"]))
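A minimal usage sketch for the factory above, assuming the engine classes and the deserialize helper are importable from the surrounding project; the option values are illustrative only:

# Illustrative options dict; the keys mirror what createEngine reads above.
engineOptions = {
    "index-file": "output/index",  # directory expected to contain PICKLE.PKL
    "type": "bm25",
    "n": 10,    # assumed: number of results to return
    "k": 1.2,   # assumed: BM25 k1 parameter
    "b": 0.75,  # assumed: BM25 b parameter
}
engine = createEngine(engineOptions)
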
    def synonymWords(self):
        uniqueWordsPath = Path(os.path.join(self.tcLocation, "TC-words.txt"))
        uniqueWordsList = loadStopWords(uniqueWordsPath)

        picklePath = Path(self.indexLocation)
        index = deserialize(os.path.join(picklePath, "PICKLE.PKL"))

        wordSynonymFull = []
        wordSynonymReduced = []

        wordSynonymDictionaryFull = {}
        wordSynonymDictionaryReduced = {}

        for word, entryData in index.indexDict.items():
            identifier = entryData['id']

            # Collect the first lemma name of every WordNet synset for this word.
            synonymSet = wordnet.synsets(word)
            uniqueSynonyms = set()
            for synset in synonymSet:
                uniqueSynonyms.add(synset.lemmas()[0].name())

            # Keep only the synonyms that also appear in the corpus word list.
            filteredSynonyms = uniqueSynonyms.copy()
            for synonym in uniqueSynonyms:
                if synonym not in uniqueWordsList:
                    filteredSynonyms.remove(synonym)

            wordSynonymDictionaryFull['id'] = identifier
            wordSynonymDictionaryFull['synonyms'] = list(uniqueSynonyms)
            wordSynonymFull.append(dict(wordSynonymDictionaryFull))

            wordSynonymDictionaryReduced['id'] = identifier
            wordSynonymDictionaryReduced['synonyms'] = list(filteredSynonyms)
            wordSynonymReduced.append(dict(wordSynonymDictionaryReduced))

        synonymJsonPath = Path(self.synonymLocation)
        synonymFull = json.dumps(wordSynonymFull, indent=4)
        synonymReduced = json.dumps(wordSynonymReduced, indent=4)

        if not os.path.exists(synonymJsonPath):
            os.makedirs(synonymJsonPath)

        with open(
                os.path.join(synonymJsonPath,
                             "word-synonyms-wordnet-full.json"),
                'w') as outfile:
            outfile.write(synonymFull)
        with open(
                os.path.join(synonymJsonPath,
                             "word-synonyms-wordnet-reduced.json"),
                'w') as outfile:
            outfile.write(synonymReduced)
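For reference, the WordNet lookup that synonymWords relies on reduces to the following NLTK calls; this is a standalone sketch and assumes the wordnet corpus has been downloaded:

import nltk
from nltk.corpus import wordnet

nltk.download("wordnet")  # one-time corpus download

# First-lemma name of every synset for a word, deduplicated as in synonymWords above.
uniqueSynonyms = {synset.lemmas()[0].name() for synset in wordnet.synsets("engine")}
print(uniqueSynonyms)
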
Example #3
    def deserializeResponse(self, response):
        # Variable Definition
        response = Utilities.deserialize(response)  # Deserialize response

        # Code Section
        # Set print color according to response status
        color = PrintColors.OKGREEN if response["status"] else PrintColors.FAIL

        Utilities.logger(color + response["message"])  # Print response
        Utilities.logger(PrintColors.RESET)

        return response["status"]
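The PrintColors constants used above are not shown in this snippet; a plausible definition based on standard ANSI escape codes would be:

class PrintColors:
    OKGREEN = "\033[92m"  # green text
    FAIL = "\033[91m"     # red text
    RESET = "\033[0m"     # reset to default colors
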
    def synonymFullPhrases(self):
        corpusPhrasesListCleaned = []
        picklePath = Path(self.indexLocation)
        index = deserialize(os.path.join(picklePath, "PICKLE.PKL"))

        jsonPhrasePath = Path(self.phraseLocation)
        jsonPhrase = jsonExtractor(
            os.path.join(jsonPhrasePath, "PHRASE-INDEX.json"))

        synonymsFullJsonPath = Path(self.synonymLocation)
        synonymsFullJson = jsonExtractor(
            os.path.join(synonymsFullJsonPath,
                         "word-synonyms-wordnet-full.json"))

        phraseSynonymEntry = {}
        phraseSynonymsList = []

        for phraseEntry in jsonPhrase:

            phraseAllMeanings = []
            encodedPhrase = phraseEntry['encoded-phrase']
            originalPhrase = phraseEntry['original-phrase']
            dpId = phraseEntry['dp-id']
            for encodedId in encodedPhrase:
                for synonymEntry in synonymsFullJson:
                    if synonymEntry['id'] == encodedId:
                        # COMMENT <TAG-002> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                        if len(synonymEntry['synonyms']) == 0:
                            reg = re.compile(r'[^A-Za-z]')
                            substitute = re.sub(reg, ' ', originalPhrase)
                            tokenizedPhraseInput = substitute.split()
                            phraseAllMeanings.append(tokenizedPhraseInput)
                        else:
                            # COMMENT </TAG-002> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                            phraseAllMeanings.append(synonymEntry['synonyms'])

            phraseSynonymEntry['dp-id'] = dpId
            phraseSynonymEntry['synonyms'] = list(
                itertools.product(*phraseAllMeanings))
            phraseSynonymsList.append(dict(phraseSynonymEntry))

        synonymJsonPath = Path(self.synonymLocation)
        if not os.path.exists(synonymJsonPath):
            os.makedirs(synonymJsonPath)
        phrasesFullSynonyms = json.dumps(phraseSynonymsList, indent=4)

        with open(
                os.path.join(synonymJsonPath,
                             "phrase-synonyms-wordnet-full.json"),
                'w') as outfile:
            outfile.write(phrasesFullSynonyms)
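The phrase expansion above is driven by itertools.product; a small self-contained illustration of the same idea with toy synonym lists:

import itertools

# Per-token synonym lists for a two-word phrase (toy data).
phraseAllMeanings = [["fast", "quick"], ["car", "automobile"]]
variants = list(itertools.product(*phraseAllMeanings))
# -> [('fast', 'car'), ('fast', 'automobile'), ('quick', 'car'), ('quick', 'automobile')]
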
    def phraseIndex(self):
        picklePath = Path(self.indexLocation)
        index = deserialize(os.path.join(picklePath, "PICKLE.PKL"))

        entryIdentifier = 0
        phraseEntry = {}
        phraseList = []

        docsPath = Path(self.docsLocation)
        dpDocs = jsonExtractor(docsPath)
        for dataPoint in dpDocs:
            dataPointName = dataPoint['name']
            tokenizedEntry = word_tokenize(dataPointName)
            encodedList = []
            for term in tokenizedEntry:
                # Map each token to its id in the index (case-insensitive match).
                for word, entryData in index.indexDict.items():
                    if word.lower() == term.lower():
                        encodedList.append(entryData['id'])
                        break

            phraseEntry['dp-id'] = entryIdentifier
            phraseEntry['original-phrase'] = dataPointName
            phraseEntry['encoded-phrase'] = encodedList
            entryIdentifier += 1
            phraseList.append(dict(phraseEntry))

        phraseObject = json.dumps(phraseList, indent=4)

        phraseJsonPath = Path(self.indexJson)
        if not os.path.exists(phraseJsonPath):
            os.makedirs(phraseJsonPath)
        with open(os.path.join(phraseJsonPath, "PHRASE-INDEX.json"),
                  'w') as outfile:
            outfile.write(phraseObject)
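Each entry appended to PHRASE-INDEX.json therefore has the shape below; the phrase and identifiers are made up for illustration:

examplePhraseEntry = {
    "dp-id": 0,
    "original-phrase": "engine speed sensor",
    "encoded-phrase": [12, 87, 301],  # word ids taken from index.indexDict
}
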
    def synonymReducedPhrases(self):
        corpusPhrasesListCleaned = []
        picklePath = Path(self.indexLocation)
        index = deserialize(os.path.join(picklePath, "PICKLE.PKL"))

        corpusPhrasesPath = Path(
            os.path.join(self.tcLocation, "TC-phrases-bi-tri.txt"))
        corpusPhrasesList = loadStopWords(corpusPhrasesPath)
        for word in corpusPhrasesList:
            corpusPhrasesListCleaned.append(word.replace("_", " ").lower())

        jsonPhrasePath = Path(self.phraseLocation)
        jsonPhrase = jsonExtractor(
            os.path.join(jsonPhrasePath, "PHRASE-INDEX.json"))

        synonymsFullJsonPath = Path(self.synonymLocation)
        synonymsFullJson = jsonExtractor(
            os.path.join(synonymsFullJsonPath,
                         "word-synonyms-wordnet-reduced.json"))

        phraseSynonymEntry = {}
        phraseSynonymsList = []

        for phraseEntry in jsonPhrase:

            phraseAllMeanings = []
            encodedPhrase = phraseEntry['encoded-phrase']
            originalPhrase = phraseEntry['original-phrase']
            tokenizedOriginalPhrase = originalPhrase.split()
            tokenInCorpusFlag = False
            for token in tokenizedOriginalPhrase:
                for phrase in corpusPhrasesListCleaned:
                    brokenPhrase = phrase.split()
                    if token.lower() in brokenPhrase:
                        tokenInCorpusFlag = True
                        break

            if not tokenInCorpusFlag:
                continue

            dpId = phraseEntry['dp-id']
            for encodedId in encodedPhrase:
                for synonymEntry in synonymsFullJson:
                    if synonymEntry['id'] == encodedId:
                        # COMMENT <TAG-001> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                        if len(synonymEntry['synonyms']) == 0:
                            reg = re.compile(r'[^A-Za-z]')
                            substitute = re.sub(reg, ' ', originalPhrase)
                            tokenizedPhraseInput = substitute.split()
                            phraseAllMeanings.append(tokenizedPhraseInput)
                        else:
                            # COMMENT </TAG-001> TO ALLOW EMPTY SYNONYMS FOR SPECIAL CASES LIKE KEYWORDS
                            phraseAllMeanings.append(synonymEntry['synonyms'])

            phraseSynonymEntry['dp-id'] = dpId
            phraseSynonymEntry['synonyms'] = list(
                itertools.product(*phraseAllMeanings))
            phraseSynonymsList.append(dict(phraseSynonymEntry))

        synonymJsonPath = Path(self.synonymLocation)
        if not os.path.exists(synonymJsonPath):
            os.makedirs(synonymJsonPath)
        phrasesReducedSynonyms = json.dumps(phraseSynonymsList, indent=4)

        with open(
                os.path.join(synonymJsonPath,
                             "phrase-synonyms-wordnet-reduced.json"),
                'w') as outfile:
            outfile.write(phrasesReducedSynonyms)
if TO_SERIALIZE:

    raw_data = read_data(TRAIN_DIR, nfiles=c_nfiles)

    # Build the dataset and the dictionaries from the raw data
    data, dictionary, reverse_dictionary = build_dataset(
        raw_data, VOCABULARY_SIZE)

    del raw_data  # To reduce memory.

    # To avoid reading the whole dataset again and again
    print("Serializing the data.")
    serialize(DATA_DICT_SERIALIZATION, (data, dictionary, reverse_dictionary))
else:
    print("Reading serialization: ")
    data, dictionary, reverse_dictionary = deserialize(DATA_DICT_SERIALIZATION)
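
serialize and deserialize are project helpers that are not shown here; a minimal sketch of the behaviour this script relies on, assuming they are thin pickle wrappers:

import pickle

def serialize(path, obj):
    # Write any picklable object to disk.
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)

def deserialize(path):
    # Load a previously serialized object.
    with open(path, "rb") as handle:
        return pickle.load(handle)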

# Store some information about the current test configuration
configuration = (f"BATCH_SIZE: {BATCH_SIZE} EMBEDDING_SIZE: {EMBEDDING_SIZE} "
                 f"WINDOW_SIZE: {WINDOW_SIZE} VOCABULARY_SIZE: {VOCABULARY_SIZE} "
                 f"nfiles: {c_nfiles} stopwd: {c_stopwd} shuffle_docs: {c_shuffle_docs} "
                 f"dataset_size: {len(data)}")

print("CONFIG: " + configuration)

# Read the question file for the Analogical Reasoning evaluation
questions = read_analogies(ANALOGIES_FILE, dictionary)

# ------------------------------------------ MODEL DEFINITION --------------------------------------------------------

graph = tf.Graph()
evall = None