Example #1
    def saveCharFeatures(self, fileOut):
        feature = Feature()
        if len(self.charEmb) == 0:
            self.loadCharEmbeddings()
        xuidPairs = self.getAllXUIDPairs()
        xuids = set()
        for (xuid1, xuid2) in xuidPairs:
            xuids.add(xuid1)
            xuids.add(xuid2)
        for xuid in xuids:
            charEmb = []
            numCharsFound = 0
            for t in self.corpus.XUIDToMention[xuid].tokens:
                lemma = self.getBestStanToken(t.stanTokens).lemma.lower()
                for char in lemma:
                    if char == "ô":
                        char = "o"
                    if char in self.charEmb:
                        if numCharsFound == 20:  # cap each mention at 20 chars
                            break
                        charEmb += self.charEmb[char]
                        numCharsFound += 1
                    else:
                        print("* WARNING: we don't have char:", str(char),
                              "of len:", len(char))
            while len(charEmb) < 400:  # 20 chars * 20 dim
                charEmb.append(0.0)
            feature.setSingle(self.corpus.XUIDToMention[xuid].UID, charEmb)

        # go through all pairs to compute relational data
        if self.saveRelationalFeatures:
            proc = 0
            completed = set()
            for xuid1, xuid2 in xuidPairs:
                uid1, uid2 = sorted([
                    self.corpus.XUIDToMention[xuid1].UID,
                    self.corpus.XUIDToMention[xuid2].UID
                ])
                if (uid1, uid2) in completed or (uid2, uid1) in completed:
                    continue
                completed.add((uid1, uid2))
                flatv1 = feature.singles[uid1]
                flatv2 = feature.singles[uid2]
                (dp, cs) = self.getDPCS(flatv1, flatv2)
                feature.addRelational(uid1, uid2, dp)
                feature.addRelational(uid1, uid2, cs)
                if proc % 1000 == 0:
                    print("\tprocessed",
                          proc,
                          "of",
                          len(xuidPairs),
                          "(%2.2f)" % float(100.0 * proc / len(xuidPairs)),
                          end="\r")
                proc += 1
        with open(fileOut, 'wb') as pickle_out:
            pickle.dump(feature, pickle_out)
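Every example's relational pass calls self.getDPCS(flatv1, flatv2), which is not part of this listing. Judging by how its return value is unpacked and saved, it presumably computes the dot product and cosine similarity of the two feature vectors. A minimal standalone sketch under that assumption (behavior inferred, not confirmed):

import math

def getDPCS(v1, v2):
    # assumed behavior: dot product and cosine similarity of two
    # equal-length feature vectors, matching the (dp, cs) unpacking above
    dp = sum(x * y for x, y in zip(v1, v2))
    norm1 = math.sqrt(sum(x * x for x in v1))
    norm2 = math.sqrt(sum(y * y for y in v2))
    cs = dp / (norm1 * norm2) if norm1 and norm2 else 0.0
    return dp, cs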
Example #2
    def saveWordNetFeatures(self, fileOut):
        feature = Feature()

        synSynToScore = {}
        xuidPairs = self.getAllXUIDPairs()
        print("calculating wordnet features for", len(xuidPairs),
              "unique pairs")
        i = 0
        completed = set()
        for xuid1, xuid2 in xuidPairs:
            uid1 = self.corpus.XUIDToMention[xuid1].UID
            uid2 = self.corpus.XUIDToMention[xuid2].UID
            if (uid1, uid2) in completed or (uid2, uid1) in completed:
                continue
            completed.add((uid1, uid2))
            textTokens1 = self.corpus.XUIDToMention[xuid1].text
            textTokens2 = self.corpus.XUIDToMention[xuid2].text
            bestScore = -1
            for t1 in textTokens1:
                syn1 = wn.synsets(t1)
                if len(syn1) == 0:
                    continue
                syn1 = syn1[0]
                for t2 in textTokens2:
                    syn2 = wn.synsets(t2)
                    if len(syn2) == 0:
                        continue
                    syn2 = syn2[0]
                    if (syn1, syn2) in synSynToScore:
                        curScore = synSynToScore[(syn1, syn2)]
                    elif (syn2, syn1) in synSynToScore:
                        curScore = synSynToScore[(syn2, syn1)]
                    else:  # calculate and cache it; look-ups are cheap
                        curScore = wn.wup_similarity(syn1, syn2)
                        synSynToScore[(syn1, syn2)] = curScore
                    # consider cached and freshly computed scores alike
                    if curScore is not None and curScore > bestScore:
                        bestScore = curScore

            feature.addRelational(uid1, uid2, bestScore)
            i += 1
            if i % 1000 == 0:
                print("\tprocessed",
                      i,
                      "of",
                      len(xuidPairs),
                      "(%2.2f)" % float(100.0 * i / len(xuidPairs)),
                      end="\r")

        with open(fileOut, 'wb') as pickle_out:
            pickle.dump(feature, pickle_out)
        print("")
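The WordNet calls above come from NLTK. A quick standalone illustration of the same pattern — taking the first synset per word and scoring with Wu-Palmer similarity (the words here are arbitrary; requires nltk.download('wordnet')):

from nltk.corpus import wordnet as wn

syn1 = wn.synsets("car")[0]    # first (most common) sense, as in the loop above
syn2 = wn.synsets("truck")[0]
score = wn.wup_similarity(syn1, syn2)  # similarity in (0, 1], or None if undefined
print(syn1, syn2, score)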
Example #3
    def saveLemmaFeatures(self, fileOut):
        feature = Feature()
        if len(self.gloveEmb) == 0:  # don't want to wastefully load again
            self.loadGloveEmbeddings()
        xuidPairs = self.getAllXUIDPairs()
        xuids = set()
        for (xuid1, xuid2) in xuidPairs:
            xuids.add(xuid1)
            xuids.add(xuid2)
        for xuid in xuids:

            sumEmb = [0] * 300
            for t in self.corpus.XUIDToMention[xuid].tokens:
                lemma = self.getBestStanToken(t.stanTokens).lemma.lower()
                if lemma not in self.gloveEmb:
                    print("* WARNING: no embedding for", lemma)
                    continue
                curEmb = self.gloveEmb[lemma]
                sumEmb = [x + y for x, y in zip(sumEmb, curEmb)]
            feature.setSingle(self.corpus.XUIDToMention[xuid].UID, sumEmb)

        if self.saveRelationalFeatures:
            # go through all pairs to compute relational data
            proc = 0
            completed = set()
            for xuid1, xuid2 in xuidPairs:
                uid1, uid2 = sorted([
                    self.corpus.XUIDToMention[xuid1].UID,
                    self.corpus.XUIDToMention[xuid2].UID
                ])
                if (uid1, uid2) in completed or (uid2, uid1) in completed:
                    continue
                completed.add((uid1, uid2))
                flatv1 = feature.singles[uid1]
                flatv2 = feature.singles[uid2]

                (dp, cs) = self.getDPCS(flatv1, flatv2)
                feature.addRelational(uid1, uid2, dp)
                feature.addRelational(uid1, uid2, cs)
                if proc % 1000 == 0:
                    print("\tprocessed",
                          proc,
                          "of",
                          len(xuidPairs),
                          "(%2.2f)" % float(100.0 * proc / len(xuidPairs)),
                          end="\r")
                proc += 1
        with open(fileOut, 'wb') as pickle_out:
            pickle.dump(feature, pickle_out)
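The embeddings behind self.gloveEmb are loaded lazily by self.loadGloveEmbeddings(), which is also not shown. A minimal sketch of such a loader for the standard 300-dimensional GloVe text format (the path and internals are assumptions; the real method may differ):

def loadGloveEmbeddings(self, path="glove.6B.300d.txt"):  # hypothetical path
    # each line of the GloVe text format is "<word> <v1> <v2> ... <v300>"
    self.gloveEmb = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            self.gloveEmb[parts[0]] = [float(x) for x in parts[1:]]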
Example #4
    def saveBoWFeatures(self, fileOut):
        feature = Feature()
        if len(self.gloveEmb) == 0:  # don't want to wastefully load again
            self.loadGloveEmbeddings()
        xuidPairs = self.getAllXUIDPairs()
        xuids = set()
        for (xuid1, xuid2) in xuidPairs:
            xuids.add(xuid1)
            xuids.add(xuid2)

        # gets a context-window vector for each mention
        for xuid in xuids:
            doc_id = self.corpus.XUIDToMention[xuid].doc_id
            # corpus-level span of the mention's tokens
            indices = [self.corpus.corpusTokensToCorpusIndex[t]
                       for t in self.corpus.XUIDToMention[xuid].tokens]
            t_startIndex = min(indices)
            t_endIndex = max(indices)
            # the N tokens before, zeroed only when they fall outside
            # the corpus or in a different document
            tmpTokens = []
            for i in range(self.bowWindow):
                ind = t_startIndex - self.bowWindow + i
                emb = None
                # check bounds before indexing; a negative index would
                # silently wrap to the end of the corpus list
                if ind >= 0:
                    cur_t = self.corpus.corpusTokens[ind]
                    if cur_t.doc_id == doc_id and cur_t.text.rstrip() != "":
                        cleanedText = self.removeQuotes(cur_t.text)
                        if cleanedText in self.gloveEmb:
                            emb = self.gloveEmb[cleanedText]
                        elif len(cur_t.stanTokens) > 0:
                            cleanedStan = self.removeQuotes(
                                self.getBestStanToken(
                                    cur_t.stanTokens).text.lower())
                            if cleanedStan in self.gloveEmb:
                                emb = self.gloveEmb[cleanedStan]
                        if emb is None:
                            print("WARNING: we don't have prevToken:",
                                  cleanedText, "or", cur_t.stanTokens)
                            print("token:", cur_t, "stans:", cur_t.stanTokens)
                tmpTokens.append(emb if emb is not None else [0] * 300)

            # the N tokens after
            for i in range(self.bowWindow):
                ind = t_endIndex + 1 + i
                emb = None
                # check bounds before indexing to avoid an IndexError
                # near the end of the corpus
                if ind < self.corpus.numCorpusTokens - 1:
                    cur_t = self.corpus.corpusTokens[ind]
                    if cur_t.doc_id == doc_id:
                        cleanedText = self.removeQuotes(cur_t.text)
                        if cleanedText in self.gloveEmb:
                            emb = self.gloveEmb[cleanedText]
                        elif len(cur_t.stanTokens) > 0:
                            cleanedStan = self.removeQuotes(
                                self.getBestStanToken(
                                    cur_t.stanTokens).text.lower())
                            if cleanedStan in self.gloveEmb:
                                emb = self.gloveEmb[cleanedStan]
                        if emb is None:
                            print("WARNING: we don't have nextToken:",
                                  cleanedText, "or", cur_t.stanTokens)
                            print("token:", cur_t, "stans:", cur_t.stanTokens)
                tmpTokens.append(emb if emb is not None else [0] * 300)
            # flatten the window of embeddings into one feature vector
            flatvector = [item for sublist in tmpTokens for item in sublist]
            feature.setSingle(self.corpus.XUIDToMention[xuid].UID, flatvector)

        if self.saveRelationalFeatures:
            proc = 0
            completed = set()
            for xuid1, xuid2 in xuidPairs:
                uid1, uid2 = sorted([
                    self.corpus.XUIDToMention[xuid1].UID,
                    self.corpus.XUIDToMention[xuid2].UID
                ])
                if (uid1, uid2) in completed or (uid2, uid1) in completed:
                    continue
                completed.add((uid1, uid2))
                flatv1 = feature.singles[uid1]
                flatv2 = feature.singles[uid2]

                (dp, cs) = self.getDPCS(flatv1, flatv2)
                feature.addRelational(uid1, uid2, dp)
                feature.addRelational(uid1, uid2, cs)
                if proc % 1000 == 0:
                    print("\tprocessed",
                          proc,
                          "of",
                          len(xuidPairs),
                          "(%2.2f)" % float(100.0 * proc / len(xuidPairs)),
                          end="\r")
                proc += 1

        with open(fileOut, 'wb') as pickle_out:
            pickle.dump(feature, pickle_out)
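This method (and the dependency features below) normalizes token text with self.removeQuotes(...) before the embedding lookup. That helper is not in the listing either; a plausible sketch, assuming it merely strips quote characters that pre-trained vocabularies typically lack:

def removeQuotes(self, text):
    # assumed behavior: drop quote characters so tokens such as '"word"'
    # can still hit the GloVe vocabulary
    for quote in ('"', "'", "`"):
        text = text.replace(quote, "")
    return text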
Example #5
    def saveDependencyFeatures(self, fileOut):
        feature = Feature()
        if len(self.gloveEmb) == 0:
            self.loadGloveEmbeddings()
        xuidPairs = self.getAllXUIDPairs()
        xuids = set()
        for (xuid1, xuid2) in xuidPairs:
            xuids.add(xuid1)
            xuids.add(xuid2)
        for xuid in xuids:
            sumParentEmb = [0] * 300
            sumChildrenEmb = [0] * 300
            numParentFound = 0
            tmpParentLemmas = []
            numChildrenFound = 0
            tmpChildrenLemmas = []
            for t in self.corpus.XUIDToMention[xuid].tokens:
                bestStanToken = self.getBestStanToken(t.stanTokens)
                if len(bestStanToken.parentLinks[
                        self.dependency_parse_type]) == 0:
                    print("* ERROR: token has no dependency parent!")
                    exit(1)
                # makes an embedding from the dependency parents' lemmas
                for stanParentLink in bestStanToken.parentLinks[
                        self.dependency_parse_type]:
                    parentLemma = self.removeQuotes(
                        stanParentLink.parent.lemma.lower())
                    curEmb = [0] * 300

                    # TMP: just to see which texts we are missing
                    tmpParentLemmas.append(parentLemma)

                    # the lemma was lowercased above, so compare against
                    # "root"; comparing to "ROOT" could never match
                    if parentLemma == "root":
                        curEmb = [1] * 300
                    elif parentLemma in self.gloveEmb:
                        curEmb = self.gloveEmb[parentLemma]
                        numParentFound += 1
                    sumParentEmb = [
                        x + y for x, y in zip(sumParentEmb, curEmb)
                    ]

                # makes an embedding from the dependency children's lemmas
                # (a token may legitimately have no children, so no check)
                for stanChildLink in bestStanToken.childLinks[
                        self.dependency_parse_type]:
                    childLemma = self.removeQuotes(
                        stanChildLink.child.lemma.lower())
                    curEmb = [0] * 300

                    # TMP: just to see which texts we are missing
                    tmpChildrenLemmas.append(childLemma)

                    if childLemma == "root":  # lowercased above, as with parents
                        curEmb = [1] * 300
                    elif childLemma in self.gloveEmb:
                        curEmb = self.gloveEmb[childLemma]
                        numChildrenFound += 1
                    sumChildrenEmb = [
                        x + y for x, y in zip(sumChildrenEmb, curEmb)
                    ]
            # concatenate the parent and child embeddings into one vector
            feature.setSingle(self.corpus.XUIDToMention[xuid].UID,
                              sumParentEmb + sumChildrenEmb)

        # go through all pairs to compute relational data
        if self.saveRelationalFeatures:
            proc = 0
            completed = set()
            for xuid1, xuid2 in xuidPairs:
                uid1, uid2 = sorted([
                    self.corpus.XUIDToMention[xuid1].UID,
                    self.corpus.XUIDToMention[xuid2].UID
                ])
                if (uid1, uid2) in completed or (uid2, uid1) in completed:
                    continue
                completed.add((uid1, uid2))
                flatv1 = feature.singles[uid1]
                flatv2 = feature.singles[uid2]

                (dp, cs) = self.getDPCS(flatv1, flatv2)
                feature.addRelational(uid1, uid2, dp)
                feature.addRelational(uid1, uid2, cs)
                if proc % 1000 == 0:
                    print("\tprocessed",
                          proc,
                          "of",
                          len(xuidPairs),
                          "(%2.2f)" % float(100.0 * proc / len(xuidPairs)),
                          end="\r")
                proc += 1
        with open(fileOut, 'wb') as pickle_out:
            pickle.dump(feature, pickle_out)
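The dependency traversal reads .parent.lemma and .child.lemma off each entry of parentLinks / childLinks. The link class itself is not shown; a minimal data holder consistent with those accesses (field names inferred from usage; the real class likely also carries the relation label):

class StanLink:
    # hypothetical reconstruction: one dependency edge between two
    # StanToken objects, inferred from the .parent/.child accesses above
    def __init__(self, parent, child):
        self.parent = parent
        self.child = child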
Example #6
    def savePOSFeatures(self, fileOut):
        feature = Feature()

        posLength = 50
        if len(self.posEmb) == 0:
            self.loadPOSEmbeddings()
        xuidPairs = self.getAllXUIDPairs()
        xuids = set()
        for (xuid1, xuid2) in xuidPairs:
            xuids.add(xuid1)
            xuids.add(xuid2)
        for xuid in xuids:
            sumEmb = [0] * posLength
            for t in self.corpus.XUIDToMention[xuid].tokens:
                pos = ""
                posOfLongestToken = ""
                longestToken = ""
                for stanToken in t.stanTokens:
                    if stanToken.pos in self.badPOS:
                        # only use the badPOS if no others have been set
                        if pos == "":
                            pos = stanToken.pos
                    else:  # save the longest, nonBad POS tag
                        if len(stanToken.text) > len(longestToken):
                            longestToken = stanToken.text
                            posOfLongestToken = stanToken.pos

                if posOfLongestToken != "":
                    pos = posOfLongestToken
                if pos == "":
                    print("* ERROR: POS is empty!")
                    exit(1)

                curEmb = self.posEmb[pos]
                sumEmb = [x + y for x, y in zip(sumEmb, curEmb)]
            feature.setSingle(self.corpus.XUIDToMention[xuid].UID, sumEmb)

        # go through all pairs to compute relational data
        if self.saveRelationalFeatures:
            completed = set()
            proc = 0
            for xuid1, xuid2 in xuidPairs:
                uid1, uid2 = sorted([
                    self.corpus.XUIDToMention[xuid1].UID,
                    self.corpus.XUIDToMention[xuid2].UID
                ])
                if (uid1, uid2) in completed or (uid2, uid1) in completed:
                    continue
                completed.add((uid1, uid2))
                flatv1 = feature.singles[uid1]
                flatv2 = feature.singles[uid2]

                (dp, cs) = self.getDPCS(flatv1, flatv2)
                feature.addRelational(uid1, uid2, dp)
                feature.addRelational(uid1, uid2, cs)
                if proc % 1000 == 0:
                    print("\tprocessed",
                          proc,
                          "of",
                          len(xuidPairs),
                          "(%2.2f)" % float(100.0 * proc / len(xuidPairs)),
                          end="\r")
                proc += 1
        with open(fileOut, 'wb') as pickle_out:
            pickle.dump(feature, pickle_out)
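Every method in this listing ends by pickling its Feature object to fileOut. Reading one back later is the mirror image, provided the Feature class is importable at load time (file name hypothetical):

import pickle

with open("pos.f", "rb") as f:  # hypothetical feature file
    feature = pickle.load(f)
print(len(feature.singles), "mention vectors loaded")  # .singles as used above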