def vectorize_masked_tokens(document,
                            maskToken='',
                            keywordProcessor=None,
                            scoringMethod='euclidean',
                            disp=False):
    """
    Iteratively masks all knowledge tokens in document string and determines
    score relative to initial vector. Returns dict of token and score
        -document: string of the document to vectorize
        -maskToken: string to replace each token with
        -scoringMethod: use euclidean distance or dot product to determine distance from baseVec
        -disp: whether to display the bar chat of distances
    """
    # assertions and special conditions
    assert isinstance(document, str), "document must have type 'str'"
    assert isinstance(maskToken, str), "maskToken must have type 'str'"
    assert (scoringMethod
            in ['euclidean',
                'dot']), "scoringMethod must be 'euclidean' or 'dot'"

    if not keywordProcessor:
        keywordProcessor = build_keyword_processor(document.split())

    # define scoring method
    if (scoringMethod == 'euclidean'):

        def calc_score(maskedVec, baseVec):
            return euclidean(maskedVec, baseVec)
    elif (scoringMethod == 'dot'):

        def calc_score(maskedVec, baseVec):
            return np.sum(maskedVec * baseVec) / np.linalg.norm(baseVec)

    # calculate vector of raw document
    baseVec = docVecs.vectorize_doc(document)

    # find tokens in document with both greedy and non-greedy matching
    foundTokens = set(keywordProcessor.extract_keywords(document))

    scoreDict = {}

    for token in foundTokens:
        print(colored(f'\t{token}', 'red'), end=' | ')
        # escape the token so regex metacharacters are treated literally
        maskedDoc = re.sub(re.escape(token), maskToken, document)
        maskedVec = docVecs.vectorize_doc(maskedDoc)
        score = calc_score(maskedVec, baseVec)
        print(colored(score, 'green'))
        scoreDict.update({token: score})

    if disp:
        plt.bar(scoreDict.keys(), scoreDict.values())
        plt.ylabel('Euclidean Distance from Base Vector'
                   if scoringMethod == 'euclidean' else 'Dot Product with Base Vector')
        plt.title(f'Tokens Iteratively Replaced With "{maskToken}"')
        plt.show()

    return scoreDict
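The two scoring methods above reduce to plain NumPy/SciPy operations; here is a minimal, self-contained sketch on toy vectors (stand-ins for the docVecs.vectorize_doc outputs used above):

import numpy as np
from scipy.spatial.distance import euclidean

baseVec = np.array([0.2, 0.9, 0.4])    # stand-in for the unmasked document vector
maskedVec = np.array([0.1, 0.7, 0.5])  # stand-in for the document with one token masked

# 'euclidean' scoring: straight-line distance between the two vectors
print(euclidean(maskedVec, baseVec))

# 'dot' scoring: projection of the masked vector onto the base vector's direction
print(np.sum(maskedVec * baseVec) / np.linalg.norm(baseVec))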
Example #2
def answer_question(question, text):
    """ Returns the sentence in text whose vector is closest (by cosine distance) to the question """
    questionVec = vectorize_doc(question)

    # split the text into sentences on terminal punctuation
    sentences = re.split(r'[.!?]', text)

    bestDist, bestSentence = 20, "NONE"
    for sentence in sentences:
        if not sentence == "":
            sentenceVec = vectorize_doc(sentence)
            sentenceDist = cosine(questionVec, sentenceVec)
            if sentenceDist < bestDist:
                bestDist = sentenceDist
                bestSentence = sentence

    return bestSentence
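The ranking above uses SciPy's cosine distance, which is 0 for vectors pointing in the same direction and approaches 2 for opposite directions; a tiny self-contained illustration of how the closest sentence would win:

import numpy as np
from scipy.spatial.distance import cosine

questionVec = np.array([1.0, 0.0, 1.0])
sentenceVecs = {'on-topic sentence': np.array([0.9, 0.1, 1.1]),
                'off-topic sentence': np.array([-1.0, 0.5, -1.0])}

# the smaller the cosine distance, the closer the sentence is to the question
for name, vec in sentenceVecs.items():
    print(name, cosine(questionVec, vec))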
Example #3
def vectorize_csv(filePath, delimiter=',', cleanFiles=False, outPath=None):
    """
    Builds a dataframe of vectors from csv where lines have form:
    'title DELIMITER text'.
        -filePath: path to the csv file to analyze
        -delimiter: the delimiter used to separate title and text
        -cleanFiles: true if the file text should be cleaned before vectorization
        -outPath: location at which to save the dataframe
    """
    fileList = []
    with open(filePath, 'r') as csvFile:
        for i, line in enumerate(csvFile):
            print(f'Building dataframe: {i}', end='\r')
            # only the first instance of the delimiter is used to split
            delimLoc = line.find(delimiter)
            title, rawText = line[:delimLoc], line[delimLoc + len(delimiter):]
            cleanText = clean_text(rawText) if cleanFiles else rawText
            textVector = docVecs.vectorize_doc(cleanText)
            textDict = vec_to_dict(textVector)
            textDict.update({'file': title})
            fileList.append(textDict)
    # build dataframe from text vectors
    dataframe = pd.DataFrame(fileList)
    # save to outPath if prompted
    if outPath:
        dataframe.to_pickle(outPath)
    return dataframe
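A hypothetical call (articles.csv and articleVectors.pkl are made-up paths; each line of the csv is expected to look like 'Some Title,article text ...'):

df = vectorize_csv('articles.csv', delimiter=',', cleanFiles=True,
                   outPath='articleVectors.pkl')
print(df.shape)           # one row per csv line, one column per vector dimension plus 'file'
print(df['file'].head())  # the titles parsed from each line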
Example #4
    def retrieve(self, question, n, cutoff=None):
        """ Retrieves top n files stored in retriver object """
        # cutoff  is infinite if none is given
        if not cutoff:
            cutoff = inf

        questionVec = vectorize_doc(question)

        scoresList = []

        counterList = ['-', '/', '+', '\\', '+', '|']
        counterLen = len(counterList)
        counter = 0

        for docTitle, docVec in (self.documents).items():
            docDist = euclidean(questionVec, docVec)
            scoresList.append(((1 / docDist), docTitle))
            if docDist < cutoff:
                docDiff = np.subtract(questionVec, docVec)
                diffDict = datasetVectorizer.vec_to_dict(docDiff)
                diffDf = pd.DataFrame([diffDict])
                prediction = self.model.predict(diffDf)[0]
                if prediction > 0.6:
                    scoresList.append((prediction, docTitle))
            print(f'\tSearching: {counterList[counter % counterLen]}',
                  end='\r')
            counter += 1
        scoresList.sort(reverse=True)
        return [scoreTuple[1] for scoreTuple in scoresList[:n]]
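The final ranking step is just a descending sort on (score, title) tuples followed by a slice; the same pattern in isolation:

# toy (score, title) pairs standing in for scoresList
scoresList = [(0.42, 'doc_a'), (0.91, 'doc_b'), (0.63, 'doc_c')]
scoresList.sort(reverse=True)  # highest score first
print([title for score, title in scoresList[:2]])  # -> ['doc_b', 'doc_c']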
Example #5
    def __init__(self, wordSet=None, inPath=None, outPath=None):
        if wordSet:
            self.data = {word: docVecs.vectorize_doc(word) for word in wordSet}
        elif inPath:
            self.data = load(inPath)
        else:
            raise ValueError('Valid load data must be given.')
        if outPath:
            save(self.data, outPath)
        print(f'{len(self.data)} words vectorized and loaded.')
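The constructor caches one vector per word in a plain dict; the same pattern with a stand-in embedding function (fake_vectorize is invented here purely for illustration):

import numpy as np

def fake_vectorize(word):
    """ Stand-in for docVecs.vectorize_doc: a toy 4-dimensional vector per word """
    rng = np.random.default_rng(len(word))
    return rng.random(4)

wordSet = {'king', 'queen', 'castle'}
data = {word: fake_vectorize(word) for word in wordSet}
print(f'{len(data)} words vectorized and loaded.')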
Example #6
    def build_file_dict(file, i):
        """ Helper to build a dict storing file vector and name """
        with open(f'{folderPath}/{file}', 'r') as fileObj:
            print(f'\tBuilding dataframe: {i}', end='\r')
            # clean the text before vectorizing if cleanFiles
            text = clean_text(fileObj.read()) if cleanFiles else fileObj.read()
            # vectorize text and convert vector to dict
            fileVector = docVecs.vectorize_doc(text)
            fileDict = vec_to_dict(fileVector)
            # add file name to the dict and return
            fileDict.update({'file': file})
            return fileDict
def bert_multiParser(inStr):
    """ Recursively parses a parenthesized expression, vectorizing terms and combining them with bert_binop """
    # grab everything between the outermost parentheses
    expression = re.findall(r'(?<=\().+(?=\))', inStr)
    subExpressions = find_subs(expression[0])
    if len(subExpressions) == 1:
        return docVecs.vectorize_doc(subExpressions[0])
    elif len(subExpressions) == 2:
        left = bert_multiParser(subExpressions[0])
        right = bert_multiParser(subExpressions[1])
        leftEnd = (expression[0].find(subExpressions[0])) + len(
            subExpressions[0])
        rightStart = expression[0].find(subExpressions[1])
        # match the operator between the two sub-expressions ('==', '?', '-', or '+')
        operator = re.findall(r'==|[?\-+]',
                              expression[0][leftEnd:rightStart])
        return bert_binop(left, right, operator[0])
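Note that the outer regex above is greedy, so expression[0] is everything between the first '(' and the last ')'; standalone:

import re

print(re.findall(r'(?<=\().+(?=\))', '((a) + (b))'))  # -> ['(a) + (b)']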
    def scrape_wiki_file(file):
        """ Helper pulls information out of wiki file and returns dict """
        if not file.endswith('.jsonl'):
            return []
        else:
            print(colored(f'Analyzing: "{file}"', 'cyan'))
            fileData = []
            with open(f'{path}/{file}', 'r') as questionFile:
                for i, questionDict in enumerate(
                        json_lines.reader(questionFile)):
                    if i >= n:
                        break
                    print(colored(f'\tReading Questions: {i}', 'yellow'),
                          end='\r')
                    try:
                        # get question text and vectorize
                        questionText = questionDict['question_text']
                        questionVec = vectorize_doc(questionText)

                        # print(f"-{questionText}")
                        #
                        # # get list of start locations for each long answer candidate
                        # answerInfo      =   questionDict['annotations'][0]
                        # shortAnswerInfo =   answerInfo['short_answers']
                        # pageTokens      =   questionDict['document_tokens']
                        #
                        #
                        # if not (shortAnswerInfo==[]):
                        #     shortStart  =   shortAnswerInfo[0]['start_token']
                        #     shortEnd    =   shortAnswerInfo[0]['end_token']
                        #     answerWords =   " ".join(tokenDict['token']
                        #                         for tokenDict in pageTokens[shortStart:shortEnd])
                        #     print(answerWords)
                        # else:
                        #     print('non')

                        # pageText = " ".join(tokenDict['token'] for tokenDict in )

                        curColumnDict = {
                            'query': questionText,
                            'vec': questionVec,
                            'score': 1
                        }

                        fileData.append(curColumnDict)

                    except Exception as e:
                        print(colored(f'\tException: {e}', 'red'))

                return fileData
def bert_parser(inStr):
    """ Tokenizes inStr, vectorizes each parenthesized term, and evaluates (term, operator, term) triples with bert_binop """
    tokens = inStr.split()
    tokenNum = len(tokens)
    if (tokenNum == 0):
        raise ValueError("Cannot parse empty string.")
    elif (tokenNum == 1):
        cleanToken = re.sub(unopMatcher, '', tokens[0])
        return docVecs.vectorize_doc(cleanToken)
    else:
        evaluateTokens = re.findall(r'(?<=\()[^)]+(?=\))', inStr)
        vectorizedTokens = {
            token: docVecs.vectorize_doc(token)
            for token in evaluateTokens
        }
        for i, token in enumerate(tokens):
            cleanToken = re.sub(unopMatcher, '', token)
            if cleanToken in vectorizedTokens:
                tokens[i] = vectorizedTokens[cleanToken]
        for i in range(0, len(tokens), 3):
            token1 = tokens[i]
            token2 = tokens[i + 2]
            operator = tokens[i + 1]
            print(token1, token2, operator)
            print(bert_binop(token1, token2, operator))
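The lookaround regex in bert_parser pulls out each parenthesized term without its brackets; in isolation:

import re

print(re.findall(r'(?<=\()[^)]+(?=\))', '(king) == (queen)'))  # -> ['king', 'queen']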
def bert_arthimetic(inStr):
    """ inStr must have form 'TERM_1 [+|-] TERM_2 '"""
    splitStrs = inStr.split()
    numTokens = len(splitStrs)

    if (numTokens == 1):
        return docVecs.vectorize_doc(splitStrs[0])

    elif (numTokens == 3):
        operator = splitStrs[1]
        vecs = docVecs.vectorize_doc_list([splitStrs[0], splitStrs[2]])
        # identify arithmetic method
        if (operator == '+'):
            return np.add(vecs[0], vecs[1])
        elif (operator == '-'):
            return np.subtract(vecs[0], vecs[1])
        else:
            raise ValueError('Invalid operator')
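The operator branch reduces to element-wise NumPy arithmetic on the two term vectors; for example, with stand-in vectors in place of the docVecs.vectorize_doc_list output:

import numpy as np

vecs = [np.array([0.8, 0.3, 0.5]), np.array([0.6, 0.9, 0.1])]
print(np.add(vecs[0], vecs[1]))       # result for 'TERM_1 + TERM_2'
print(np.subtract(vecs[0], vecs[1]))  # result for 'TERM_1 - TERM_2'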
Example #11
    def __init__(self, documentsCsv, n, sep=','):
        """
        Initializes retriever object with vectorized files
        """
        documentsDict = {}
        with open(documentsCsv, 'r') as documentsFile:
            for i, line in enumerate(documentsFile):
                if i > n:
                    break
                print(f'Reading File: {i}', end='\r')
                sepLoc = line.find(sep)
                title = line[:sepLoc]
                text = line[sepLoc:]
                textVec = vectorize_doc(text)
                documentsDict.update({title: textVec})
        self.documents = documentsDict
        self.model = load_model(
            'data/outData/models/documentRetrievalModel.sav')
def create_fake_queries(queryNum, queryLen=5, outPath=None):
    """
    Build dataframe of freq-weighted random word queries of
    len==(normal around queryLen) and thier vectors
    """
    def unzip_freqDict(d):
        tokens, freqs = [], []
        for key, val in d.items():
            tokens.append(key)
            freqs.append(val[0])
        return tokens, np.array(freqs)

    tokens, freqs = unzip_freqDict(freqDict)
    freqs /= freqs.sum()

    words = np.random.choice(tokens, size=queryNum, replace=True, p=freqs)
    chunks = [
        max(int(chunk), 1)
        for chunk in np.random.normal(queryLen, size=queryNum)
    ]
    print(len(words), len(chunks))

    queryList = []

    for i in range(len(words)):
        print(f'Building: {i}', end='\r')
        chunkSteps = chunks[i]
        if ((queryNum - i) > chunkSteps):
            sent = " ".join(words[i:(i + chunkSteps)])
        else:
            sent = " ".join(words[i:(i + (queryNum - i))])
        vec = vectorize_doc(sent)
        queryList.append({'query': sent, 'vec': vec, 'score': 0})

    queryDf = pd.DataFrame(queryList)

    if outPath:
        queryDf.to_pickle(outPath)

    return queryDf
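unzip_freqDict appears to assume freqDict maps each token to a tuple whose first element is its frequency; a tiny stand-in (the counts are invented) shows how those counts become the sampling probabilities fed to np.random.choice:

import numpy as np

# stand-in for the module-level freqDict: token -> (count, ...)
freqDict = {'data': (30,), 'model': (20,), 'vector': (50,)}

tokens = list(freqDict.keys())
freqs = np.array([val[0] for val in freqDict.values()], dtype=float)
freqs /= freqs.sum()                  # counts normalized into probabilities
print(dict(zip(tokens, freqs)))       # {'data': 0.3, 'model': 0.2, 'vector': 0.5}

# words are then drawn in proportion to their frequency
print(np.random.choice(tokens, size=5, replace=True, p=freqs))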
def analyze_text(text):
    """ Vectorizes the text and prints the sentiment model's prediction """
    textVec = docVecs.vectorize_doc(text)
    vecDict = [vec_to_dict(textVec)]
    df = pd.DataFrame(vecDict)
    prediction = sentimentModel.predict(df)
    print(prediction)
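vec_to_dict is not defined in these excerpts; a minimal sketch of the behavior its callers appear to expect (one key per vector dimension), offered purely as an assumption:

import numpy as np

def vec_to_dict(vec):
    """ Hypothetical helper: map each dimension of a 1-D vector to a numbered key """
    return {f'dim_{i}': float(val) for i, val in enumerate(vec)}

print(vec_to_dict(np.array([0.1, 0.9, 0.4])))  # -> {'dim_0': 0.1, 'dim_1': 0.9, 'dim_2': 0.4}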