def loadSentences(self, identifier, sentlist):
    """Populate self.test from an in-memory list of sentence strings.

    identifier: string used as the header/ID prefix for this sentence list.
    sentlist: list of sentence strings, already tokenized.
    """
    self.fileid = identifier
    # Each sentence becomes an Instance id'd "<identifier>.<position>".
    self.test = [
        Instance(identifier + "." + str(pos), 0, features.RawSent(sentence))
        for pos, sentence in enumerate(sentlist)
    ]
def numEmoji(self):
    """Dump normalized symbol and emoji counts for the loaded set to a CSV."""
    raw_sents = [features.RawSent(rec) for rec in self.test]
    out = pd.DataFrame()
    out["numsymbols"] = features.numSymbols(raw_sents, normalize=True)
    out["numemoji"] = features.countEmoji(raw_sents, normalize=True)
    out.to_csv("symbol&emoji.csv", encoding="utf-8")
def fShallow(self):
    """Add shallow surface features: word/sentence lengths and character counts."""
    raw_sents = [features.RawSent(rec) for rec in self.test]
    self._add_feature("avgwordlen", features.avgWordLen(raw_sents))
    self._add_feature("sentlen", features.sentLen(raw_sents))
    # Count features are normalized (original passed normalize=True positionally).
    self._add_feature("numsymbols", features.numSymbols(raw_sents, True))
    self._add_feature("numcapltrs", features.numCapLetters(raw_sents, True))
    self._add_feature("numnumbers", features.numNumbers(raw_sents, True))
def NE_Concrete(self):
    """Add named-entity and concreteness columns produced by NE_Concrete_Emo."""
    raw_sents = [features.RawSent(rec) for rec in self.test]
    tag_frame = features.NE_Concrete_Emo(raw_sents)
    # Only these columns of the extractor's output are used as features.
    for tag in ('ORGANIZATION', 'PERCENT', 'PERSON', 'DATE',
                'MONEY', 'TIME', 'LOCATION', 'Concrete'):
        self._add_feature(tag, tag_frame.loc[:, tag])
def loadFromFile(self, filename):
    """Populate self.test from a text file, one sentence per non-blank line.

    The basename of *filename* becomes self.fileid; each instance is id'd
    "<fileid>.<index>" where the index counts only non-blank lines.
    """
    self.test = []
    self.fileid = os.path.basename(filename)
    idx = 0
    with open(filename) as f:
        for line in f:
            text = line.strip()  # strip once (original stripped each line twice)
            if not text:
                continue  # skip blank lines without consuming an index
            self.test.append(
                Instance(self.fileid + "." + str(idx), 0, features.RawSent(text)))
            idx += 1
    # Fix: removed the redundant f.close() — the `with` block already
    # guarantees the file is closed, even on error.
def transEmotionFeature(self):
    """Add Negative/Positive emotion features, preferring a cached CSV.

    Falls back to recomputing via features.NE_Concrete_Emo when the cache
    file "NE_Concrete_Emo.csv" is absent.
    """
    self.transformEmoji()
    try:
        # Keep the try body minimal: only the cache read can legitimately
        # raise IOError. The original wrapped the _add_feature calls too,
        # so an IOError after a successful read_csv would be silently
        # swallowed and trigger a needless recompute.
        frame = pd.read_csv("NE_Concrete_Emo.csv")
    except IOError:
        # Cache miss: compute the emotion frame directly from the sentences.
        raw_sents = [features.RawSent(rec) for rec in self.test]
        frame = features.NE_Concrete_Emo(raw_sents)
    # Single call site for both columns (deduplicates the original's two copies).
    self._add_feature("Negative", frame.loc[:, 'Negative'])
    self._add_feature("Positive", frame.loc[:, 'Positive'])
def fNeuralVec(self):
    """Add 100-dimensional word-embedding features (word_embed-0..99).

    Skips the work entirely if the first column is already present in
    self.featurestest.
    """
    raw_sents = [features.RawSent(rec) for rec in self.test]
    names = ["word_embed-" + str(dim) for dim in range(100)]
    if names[0] not in self.featurestest:
        matrix = features.word_2_weights(raw_sents, self.embeddings)
        # Transpose: one feature column per embedding dimension.
        for col, name in enumerate(names):
            self.featurestest[name] = [row[col] for row in matrix]
        print("Successfully generate word_embdding features")
def fBrownCluster_100(self):
    """Add 100-dimensional Brown-cluster features (brnclst_100-0..99).

    Skips the work entirely if the first column is already present in
    self.featurestest.
    """
    raw_sents = [features.RawSent(rec) for rec in self.test]
    names = ["brnclst_100-" + str(dim) for dim in range(100)]
    if names[0] not in self.featurestest:
        print("Start initialize Browncluster ....")
        clusters, cluster_2_index = utils.readMetaOptimizeBrownCluster_100()
        print("finished generating brownClusterlist !")
        self.brnclst = clusters  # retain the cluster map on the instance
        matrix = features.brownCluster(raw_sents, clusters, cluster_2_index, 100)
        # Transpose: one feature column per cluster dimension.
        for col, name in enumerate(names):
            self.featurestest[name] = [row[col] for row in matrix]
def transformEmoji(self):
    """Add a normalized emoji-count feature for the loaded sentences."""
    raw_sents = [features.RawSent(rec) for rec in self.test]
    counts = features.countEmoji(raw_sents, normalize=True)
    self._add_feature("numemoji", counts)
def fPostag(self):
    """Add part-of-speech frequency features for a fixed set of useful tags."""
    raw_sents = [features.RawSent(rec) for rec in self.test]
    tag_frame = features.extractPOS(raw_sents)
    # Only these POS columns proved useful as features.
    for tag in ('DT', 'NN', 'VB', 'JJ', 'IN', '.', 'PRP', 'NNP', 'WP'):
        self._add_feature(tag, tag_frame.loc[:, tag])