def __iter__(self):
    for fname in os.listdir(self.dirname):
        fname = os.path.join(self.dirname, fname)
        if not os.path.isfile(fname):
            continue
        for line in utils.smart_open(fname):
            self.i += 1
            if self.i == 1:
                tw = line.lower().strip()
            if self.i == 2:
                target = line.lower().strip()
            if self.i == 3:
                senti = int(line.strip()) + 2
                # Using already preprocessed data from Tang et al. 2016:
                # '$t$' is the placeholder for the target entity.
                tw = tw.replace('$t$', target)
                # Pad the target with spaces, then collapse multi-word
                # targets into a single underscore-joined token.
                tw = tw.replace(target, ' ' + target + ' ')
                tw = tw.replace(''.join(target.split()),
                                ' ' + '_'.join(target.split()) + ' ')
                tw = tw.replace(target, ' ' + '_'.join(target.split()) + ' ')
                tweet = tokenize(tw)
                yield (tweet, '_'.join(target.split()), senti)
                self.i = 0
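# Standalone sketch of the three-line record format consumed above. The
# sample record is invented for illustration and tokenize is approximated
# by str.split; it mirrors the '$t$' substitution and the +2 label shift.
record = ['i met $t$ today\n', 'barack obama\n', '-1\n']
tw = record[0].lower().strip()
target = record[1].lower().strip()
senti = int(record[2].strip()) + 2
tw = tw.replace('$t$', target)
tw = tw.replace(target, ' ' + '_'.join(target.split()) + ' ')
print tw.split(), '_'.join(target.split()), senti
# prints: ['i', 'met', 'barack_obama', 'today'] barack_obama 1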
def __iter__(self):
    with open(self.fname, 'rb') as f:
        for line in f:
            parts = line.strip().lower().split()
            if len(parts) >= 2:
                y = parts[0]
                tw = ' '.join(parts[1:])
                x = tokenize(tw.decode('latin1'))  # decoding in Latin-1 for Arabic
                x = u' '.join(x)
                yield x, y
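# Standalone sketch of the line format consumed above: the label comes
# first, then the tweet text, one tweet per line. The sample line is
# invented and tokenize is approximated by str.split; real corpus lines
# carry Latin-1 encoded Arabic after the label.
line = 'pos this is a sample tweet\n'
parts = line.strip().lower().split()
y = parts[0]
x = u' '.join(' '.join(parts[1:]).decode('latin1').split())
print y, x  # prints: pos this is a sample tweet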
def __iter__(self):
    for fname in os.listdir(self.dirname):
        fname = os.path.join(self.dirname, fname)
        if not os.path.isfile(fname):
            continue
        for line in utils.smart_open(fname):
            line = line.split('\t')
            tweet_id = line[1]
            if line[2] == 'positive':
                sent = 1
            elif line[2] == 'negative':
                sent = -1
            elif line[2] == 'neutral':
                sent = 0
            else:
                continue  # skip rows with an unknown sentiment label
            senti = sent + 1  # map {-1, 0, 1} to {0, 1, 2}
            target = line[3].lower().strip()
            location = line[4]
            tw = line[-1].lower().strip()
            tw = fix_text(tw.decode('utf-8'))
            # Character spans of every word-boundary occurrence of the target.
            spans = []
            p = re.compile(r'(?<!\w)({0})(?!\w)'.format(re.escape(target)))
            for m in p.finditer(tw.lower()):
                spans.append([m.start(), m.start() + len(m.group())])
            if location != 'nan':
                cc = 0
                for a, b in enumerate(spans):
                    # Pick the span whose boundaries are close to the
                    # annotated character offset.
                    if b[0] - 1 <= int(location) <= b[1] + 4:
                        wh = a
                        cc = 1
                if cc == 0:
                    wh = 'nan'
            else:
                wh = location
            if wh == 'nan':
                # No usable offset: mark every occurrence of the target.
                tw = tw.replace(target, ' ' + target + ' ')
                tw = tw.replace(''.join(target.split()),
                                ' ' + '_'.join(target.split()) + ' ')
                tw = tw.replace(target, ' ' + '_'.join(target.split()) + ' ')
            else:
                try:
                    r = spans[wh]
                except IndexError:
                    print "Error at processing election data; at line 85 process_data.py!"
                    continue
                # Rewrite only the matched span; each slice is widened by 2
                # more characters to absorb the padding spaces inserted by
                # the preceding replace.
                tw = tw[:r[0]] + tw[r[0]:r[1] + 2].replace(
                    target, ' ' + target + ' ') + tw[r[1] + 2:]
                tw = tw[:r[0]] + tw[r[0]:r[1] + 4].replace(
                    ''.join(target.split()),
                    ' ' + '_'.join(target.split()) + ' ') + tw[r[1] + 4:]
                tw = tw[:r[0]] + tw[r[0]:r[1] + 6].replace(
                    target, ' ' + '_'.join(target.split()) + ' ') + tw[r[1] + 6:]
            tweet = tokenize(tw)
            yield (tweet, '_'.join(target.split()), senti, tweet_id, wh)
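# Standalone sketch of the span-matching step above: the look-around
# pattern matches the target only at word boundaries, so 'trump' inside
# 'trumpet' is not matched. The sample string is invented; the calls
# mirror the compile/finditer pair in __iter__ and run on their own.
import re

tw = 'donald trump plays the trumpet; vote trump'
target = 'trump'
p = re.compile(r'(?<!\w)({0})(?!\w)'.format(re.escape(target)))
spans = [[m.start(), m.start() + len(m.group())] for m in p.finditer(tw)]
print spans  # prints: [[7, 12], [37, 42]] -- 'trumpet' at 23 is skipped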
def tok_and_replace(sen, en):
    tw = sen.lower()
    target = en.lower()
    # Using already preprocessed data from Tang et al. 2016:
    # '$t$' is the placeholder for the target entity.
    tw = tw.replace('$t$', target)
    tw = tw.replace(target, ' ' + target + ' ')
    tw = tw.replace(''.join(target.split()),
                    ' ' + '_'.join(target.split()) + ' ')
    tw = tw.replace(target, ' ' + '_'.join(target.split()) + ' ')
    tweet = tokenize(tw)
    targetEn = '_'.join(target.split())
    assert (targetEn in tweet)
    return tweet, targetEn
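# Worked example for tok_and_replace; the input pair is invented, and the
# expected result assumes tokenize roughly preserves whitespace-separated
# tokens:
#
#   tweet, targetEn = tok_and_replace('I met $t$ today', 'Barack Obama')
#   # targetEn == 'barack_obama', tweet ~ ['i', 'met', 'barack_obama', 'today'];
#   # the assert guarantees the collapsed target survives tokenization
#   # as a single token.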
def __iter__(self):
    for fname in os.listdir(self.dirname):
        fname = os.path.join(self.dirname, fname)
        if not os.path.isfile(fname):
            continue
        for line in utils.smart_open(fname):
            self.i += 1
            if self.i == 1:
                tw = line.lower().strip()
            if self.i == 2:
                target = line.lower().strip()
            if self.i == 3:
                senti = int(line.strip()) + 1
                tw = tw.replace(target, ' ' + target + ' ')
                tw = tw.replace(''.join(target.split()),
                                ' ' + '_'.join(target.split()) + ' ')
                tw = tw.replace(target, ' ' + '_'.join(target.split()) + ' ')
                tweet = tokenize(tw)
                # tweetpro = twprocess(tweet).tweet
                yield (tweet, '_'.join(target.split()), senti)
                self.i = 0
from keras.utils import to_categorical
from xml.dom import minidom
from dlblocks import text
from dlblocks.pyutils import mapArrays, loadJson, saveJson, selectKeys, oneHotVec, padList
from dlblocks.pyutils import int64Arr, floatArr

# Label map for the code-switched (English-Spanish) sentiment corpus.
sents = {"N": -1, "P": 1, "NONE": 0}

data = open("./data_cm_senti/cs-corpus-with-tweets_train.txt").read().split("\n")
data = filter(None, data)  # drop empty trailing lines before splitting fields
data = map(lambda x: x.split("\t"), data)
data = map(
    lambda x: {
        'sentiment': sents[x[1]],
        'tokens': tokenize(x[2]),
        'text': x[2]
    }, data)
en_es_wssa_data_train = data

data = open("./data_cm_senti/cs-corpus-with-tweets_test.txt").read().split("\n")
data = filter(None, data)  # drop empty trailing lines before splitting fields
data = map(lambda x: x.split("\t"), data)
data = map(
    lambda x: {
        'sentiment': sents[x[1]],
        'tokens': tokenize(x[2]),
        'text': x[2]
    }, data)
en_es_wssa_data_test = data
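# Quick sanity check of the loaded splits (illustrative): each record maps
# 'sentiment' to an integer in {-1, 0, 1}, 'tokens' to the token list, and
# 'text' to the raw tweet.
ex = en_es_wssa_data_train[0]
print ex['sentiment'], len(ex['tokens'])
print len(en_es_wssa_data_train), len(en_es_wssa_data_test)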