def vectorize_masked_tokens(document, maskToken='', keywordProcessor=None,
                            scoringMethod='euclidean', disp=False):
    """
    Iteratively masks all knowledge tokens in document string and determines
    score relative to the initial vector. Returns dict of token and score.
        -document: string of the document to vectorize
        -maskToken: string to replace each token with
        -scoringMethod: use euclidean distance or dot product to determine
            distance from baseVec
        -disp: whether to display the bar chart of distances
    """
    # assertions and special conditions
    assert isinstance(document, str), "document must have type 'str'"
    assert isinstance(maskToken, str), "maskToken must have type 'str'"
    assert (scoringMethod in ['euclidean', 'dot']), "scoringMethod must be 'euclidean' or 'dot'"
    if not keywordProcessor:
        keywordProcessor = build_keyword_processor(document.split())
    # define scoring method
    if (scoringMethod == 'euclidean'):
        def calc_score(maskedVec, baseVec):
            return euclidean(maskedVec, baseVec)
    elif (scoringMethod == 'dot'):
        def calc_score(maskedVec, baseVec):
            return np.sum(maskedVec * baseVec) / np.linalg.norm(baseVec)
    # calculate vector of raw document
    baseVec = docVecs.vectorize_doc(document)
    # find tokens in document with both greedy and non-greedy matching
    foundTokens = set(keywordProcessor.extract_keywords(document))
    scoreDict = {}
    for token in foundTokens:
        print(colored(f'\t{token}', 'red'), end=' | ')
        # escape the token so regex metacharacters are treated literally
        maskedDoc = re.sub(re.escape(token), maskToken, document)
        maskedVec = docVecs.vectorize_doc(maskedDoc)
        score = calc_score(maskedVec, baseVec)
        print(colored(score, 'green'))
        scoreDict.update({token: score})
    if disp:
        plt.bar(scoreDict.keys(), scoreDict.values())
        plt.ylabel('Euclidean Distance from Base Vector')
        plt.title(f'Tokens Iteratively Replaced With "{maskToken}"')
        plt.show()
    return scoreDict

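# Hypothetical usage sketch for vectorize_masked_tokens (assumes docVecs,
# build_keyword_processor, and matplotlib are available in this module's scope;
# the sample document and mask string below are illustrative only):
#   sampleDoc = 'The Eiffel Tower is located in Paris.'
#   tokenScores = vectorize_masked_tokens(sampleDoc, maskToken='[MASK]',
#                                         scoringMethod='euclidean', disp=True)
#   # tokenScores maps each extracted keyword to its distance from baseVec
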
def answer_question(question, text):
    """ Returns the sentence of text whose vector is closest to the question vector """
    questionVec = vectorize_doc(question)
    # split text into sentences on '.', '!', or '?'
    sentences = re.split(r'[.!?]', text)
    bestDist, bestSentence = 20, "NONE"
    for sentence in sentences:
        if not sentence == "":
            sentenceVec = vectorize_doc(sentence)
            sentenceDist = cosine(questionVec, sentenceVec)
            if sentenceDist < bestDist:
                bestDist = sentenceDist
                bestSentence = sentence
    return bestSentence

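# Hypothetical usage sketch for answer_question (assumes vectorize_doc and
# scipy's cosine are available; the question and passage are illustrative):
#   passage = ('The mitochondria is the powerhouse of the cell. '
#              'Ribosomes synthesize proteins.')
#   print(answer_question('What synthesizes proteins?', passage))
#   # prints the sentence of passage whose vector is closest to the question
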
def vectorize_csv(filePath, delimiter=',', cleanFiles=False, outPath=None):
    """
    Builds a dataframe of vectors from a csv where lines have the form
    'title DELIMITER text'.
        -filePath: path to the csv file to analyze
        -delimiter: the delimiter used to separate title and text
        -cleanFiles: True if the file text should be cleaned before vectorization
        -outPath: location at which to save the dataframe
    """
    fileList = []
    with open(filePath, 'r') as csvFile:
        for i, line in enumerate(csvFile):
            print(f'Building dataframe: {i}', end='\r')
            # only the first instance of the delimiter is used to split;
            # skip the delimiter character itself when taking the text
            delimLoc = line.find(delimiter)
            title, rawText = line[:delimLoc], line[(delimLoc + 1):]
            cleanText = clean_text(rawText) if cleanFiles else rawText
            textVector = docVecs.vectorize_doc(cleanText)
            textDict = vec_to_dict(textVector)
            textDict.update({'file': title})
            fileList.append(textDict)
    # build dataframe from text vectors
    dataframe = pd.DataFrame(fileList)
    # save to outPath if prompted
    if outPath:
        dataframe.to_pickle(outPath)
    return dataframe

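# Hypothetical usage sketch for vectorize_csv (the file paths and delimiter
# below are placeholders, not paths from this repo):
#   df = vectorize_csv('exampleData/articles.csv', delimiter='|',
#                      cleanFiles=True, outPath='exampleData/articles.pkl')
#   # each row holds one document vector plus a 'file' column with its title
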
def retrieve(self, question, n, cutoff=None):
    """ Retrieves top n files stored in retriever object """
    # cutoff is infinite if none is given
    if not cutoff:
        cutoff = inf
    questionVec = vectorize_doc(question)
    scoresList = []
    counterList = ['-', '/', '+', '\\', '+', '|']
    counterLen = len(counterList)
    counter = 0
    for docTitle, docVec in (self.documents).items():
        # score every document by inverse distance to the question vector
        docDist = euclidean(questionVec, docVec)
        scoresList.append(((1 / docDist), docTitle))
        if docDist < cutoff:
            # score close documents with the retrieval model on the vector difference
            docDiff = np.subtract(questionVec, docVec)
            diffDict = datasetVectorizer.vec_to_dict(docDiff)
            diffDf = pd.DataFrame([diffDict])
            prediction = self.model.predict(diffDf)[0]
            if prediction > 0.6:
                scoresList.append((prediction, docTitle))
        print(f'\tSearching: {counterList[counter % counterLen]}', end='\r')
        counter += 1
    scoresList.sort(reverse=True)
    return [scoreTuple[1] for scoreTuple in scoresList[:n]]

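# Hypothetical usage sketch for retrieve (the class name DocumentRetriever is
# used here only for illustration; it assumes the retriever __init__ below that
# takes a documents csv and row limit, and a query string that is illustrative):
#   retriever = DocumentRetriever('exampleData/documents.csv', n=1000)
#   topTitles = retriever.retrieve('who invented the telephone', n=5)
#   # returns up to 5 document titles ranked by inverse distance / model score
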
def __init__(self, wordSet=None, inPath=None, outPath=None):
    if wordSet:
        self.data = {word: docVecs.vectorize_doc(word) for word in wordSet}
    elif inPath:
        self.data = load(inPath)
    else:
        raise ValueError('Valid load data must be given.')
    if outPath:
        save(self.data, outPath)
    print(f'{len(self.data)} words vectorized and loaded.')

def build_file_dict(file, i):
    """ Helper to build a dict storing file vector and name """
    # folderPath and cleanFiles are expected to be defined in the enclosing scope
    with open(f'{folderPath}/{file}', 'r') as fileObj:
        print(f'\tBuilding dataframe: {i}', end='\r')
        # clean the text before vectorizing if cleanFiles
        text = clean_text(fileObj.read()) if cleanFiles else fileObj.read()
        # vectorize text and convert vector to dict
        fileVector = docVecs.vectorize_doc(text)
        fileDict = vec_to_dict(fileVector)
        # add file name to the dict and return
        fileDict.update({'file': file})
        return fileDict

def bert_multiParser(inStr):
    # pull the outermost parenthesized expression from the input string
    expression = re.findall(r'(?<=\().+(?=\))', inStr)
    subExpressions = find_subs(expression[0])
    if len(subExpressions) == 1:
        return docVecs.vectorize_doc(subExpressions[0])
    elif len(subExpressions) == 2:
        # recursively evaluate each sub-expression
        left = bert_multiParser(subExpressions[0])
        right = bert_multiParser(subExpressions[1])
        leftEnd = expression[0].find(subExpressions[0]) + len(subExpressions[0])
        rightStart = expression[0].find(subExpressions[1])
        # find the binary operator ('==', '?', '-', or '+') between the two sub-expressions
        operator = re.findall(r'==|[?\-+]', expression[0][leftEnd:rightStart])
        return bert_binop(left, right, operator[0])

def scrape_wiki_file(file):
    """ Helper pulls information out of wiki file and returns dict """
    if not file.endswith('.jsonl'):
        return []
    else:
        print(colored(f'Analyzing: "{file}"', 'cyan'))
        fileData = []
        # path and n are expected to be defined in the enclosing scope
        with open(f'{path}/{file}', 'r') as questionFile:
            for i, questionDict in enumerate(json_lines.reader(questionFile)):
                if i >= n:
                    break
                print(colored(f'\tReading Questions: {i}', 'yellow'), end='\r')
                try:
                    # get question text and vectorize
                    questionText = questionDict['question_text']
                    questionVec = vectorize_doc(questionText)
                    # print(f"-{questionText}")
                    #
                    # # get list of start locations for each long answer candidate
                    # answerInfo = questionDict['annotations'][0]
                    # shortAnswerInfo = answerInfo['short_answers']
                    # pageTokens = questionDict['document_tokens']
                    #
                    # if not (shortAnswerInfo == []):
                    #     shortStart = shortAnswerInfo[0]['start_token']
                    #     shortEnd = shortAnswerInfo[0]['end_token']
                    #     answerWords = " ".join(tokenDict['token']
                    #         for tokenDict in pageTokens[shortStart:shortEnd])
                    #     print(answerWords)
                    # else:
                    #     print('non')
                    # pageText = " ".join(tokenDict['token'] for tokenDict in )
                    curColumnDict = {'query': questionText,
                                     'vec': questionVec,
                                     'score': 1}
                    fileData.append(curColumnDict)
                except Exception as e:
                    print(colored(f'\tException: {e}', 'red'))
        return fileData

def bert_parser(inStr):
    tokens = inStr.split()
    tokenNum = len(tokens)
    if (tokenNum == 0):
        raise ValueError("Cannot parse empty string.")
    elif (tokenNum == 1):
        cleanToken = re.sub(unopMatcher, '', tokens[0])
        return docVecs.vectorize_doc(cleanToken)
    else:
        # vectorize each parenthesized token in the expression
        evaluateTokens = re.findall(r'(?<=\()[^)]+(?=\))', inStr)
        vectorizedTokens = {token: docVecs.vectorize_doc(token)
                            for token in evaluateTokens}
        # replace each matched token in the expression with its vector
        for i, token in enumerate(tokens):
            cleanToken = re.sub(unopMatcher, '', token)
            if cleanToken in vectorizedTokens:
                tokens[i] = vectorizedTokens[cleanToken]
        # evaluate the expression as (term, operator, term) triples;
        # the upper bound keeps the final triple inside the token list
        for i in range(0, len(tokens) - 2, 3):
            token1 = tokens[i]
            token2 = tokens[i + 2]
            operator = tokens[i + 1]
            print(token1, token2, operator)
            print(bert_binop(token1, token2, operator))

def bert_arthimetic(inStr):
    """ inStr must have the form 'TERM_1 [+|-] TERM_2' """
    splitStrs = inStr.split()
    numTokens = len(splitStrs)
    if (numTokens == 1):
        return docVecs.vectorize_doc(splitStrs[0])
    elif (numTokens == 3):
        operator = splitStrs[1]
        vecs = docVecs.vectorize_doc_list([splitStrs[0], splitStrs[2]])
        # identify arithmetic method
        if (operator == '+'):
            return np.add(vecs[0], vecs[1])
        elif (operator == '-'):
            return np.subtract(vecs[0], vecs[1])
        else:
            raise ValueError('Invalid operator')
    else:
        raise ValueError("inStr must have the form 'TERM_1 [+|-] TERM_2'")

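# Hypothetical usage sketch for bert_arthimetic (assumes docVecs is imported;
# the words below are illustrative):
#   sumVec = bert_arthimetic('king + woman')     # element-wise vector addition
#   diffVec = bert_arthimetic('paris - france')  # element-wise vector subtraction
#   # both return arrays with the same dimensionality as a single doc vector
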
def __init__(self, documentsCsv, n, sep=','):
    """ Initializes retriever object with vectorized files """
    documentsDict = {}
    with open(documentsCsv, 'r') as documentsFile:
        for i, line in enumerate(documentsFile):
            if i > n:
                break
            print(f'Reading File: {i}', end='\r')
            # split on the first instance of the separator
            sepLoc = line.find(sep)
            title = line[:sepLoc]
            text = line[sepLoc:]
            textVec = vectorize_doc(text)
            documentsDict.update({title: textVec})
    self.documents = documentsDict
    self.model = load_model('data/outData/models/documentRetrievalModel.sav')

def create_fake_queries(queryNum, queryLen=5, outPath=None):
    """
    Build dataframe of freq-weighted random word queries of
    len==(normal around queryLen) and their vectors
    """
    def unzip_freqDict(d):
        tokens, freqs = [], []
        for key, val in d.items():
            tokens.append(key)
            freqs.append(val[0])
        # cast to float so the in-place normalization below works for integer counts
        return tokens, np.array(freqs, dtype=float)

    tokens, freqs = unzip_freqDict(freqDict)
    # normalize frequencies into a probability distribution
    freqs /= freqs.sum()
    words = np.random.choice(tokens, size=queryNum, replace=True, p=freqs)
    # draw query lengths from a normal distribution centered on queryLen
    chunks = [max(int(chunk), 1)
              for chunk in np.random.normal(queryLen, size=queryNum)]
    print(len(words), len(chunks))
    queryList = []
    for i in range(len(words)):
        print(f'Building: {i}', end='\r')
        chunkSteps = chunks[i]
        if ((queryNum - i) > chunkSteps):
            sent = " ".join(words[i:(i + chunkSteps)])
        else:
            sent = " ".join(words[i:(i + (queryNum - i))])
        vec = vectorize_doc(sent)
        queryList.append({'query': sent, 'vec': vec, 'score': 0})
    queryDf = pd.DataFrame(queryList)
    if outPath:
        queryDf.to_pickle(outPath)
    return queryDf

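# Hypothetical usage sketch for create_fake_queries (assumes the module-level
# freqDict maps token -> tuple whose first element is a frequency, as implied by
# unzip_freqDict above; the output path is a placeholder):
#   fakeDf = create_fake_queries(queryNum=10000, queryLen=5,
#                                outPath='exampleData/fakeQueries.pkl')
#   # each row holds a random query string, its vector, and score == 0
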
def analyze_text(text):
    textVec = docVecs.vectorize_doc(text)
    vecDict = [vec_to_dict(textVec)]
    df = pd.DataFrame(vecDict)
    prediction = sentimentModel.predict(df)
    print(prediction)

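# Hypothetical usage sketch for analyze_text (assumes sentimentModel was loaded
# at module level; the input string is illustrative):
#   analyze_text('This movie was a complete waste of time.')
#   # prints the sentiment model's prediction for the vectorized text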