Example 1
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            fname = os.path.join(self.dirname, fname)
            if not os.path.isfile(fname):
                continue
            # Each record spans three consecutive lines: tweet, target, sentiment.
            for line in utils.smart_open(fname):
                self.i += 1
                if self.i == 1:
                    tw = line.lower().strip()
                if self.i == 2:
                    target = line.lower().strip()
                if self.i == 3:
                    senti = int(line.strip()) + 2  # offset the raw sentiment label
                    # Using already preprocessed data from Tang et al. 2016,
                    # where the target is masked as '$t$'.
                    tw = tw.replace('$t$', target)
                    # Pad the target with spaces, then rewrite it as a single
                    # underscore-joined token so it survives tokenization.
                    tw = tw.replace(target, ' ' + target + ' ')
                    tw = tw.replace(''.join(target.split()),
                                    ' ' + '_'.join(target.split()) + ' ')
                    tw = tw.replace(target,
                                    ' ' + '_'.join(target.split()) + ' ')
                    tweet = tokenize(tw)
                    yield (tweet, '_'.join(target.split()), senti)
                    self.i = 0
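
For context, the files read above hold one example per three consecutive lines: the tweet with the target masked as '$t$', the target entity, and a sentiment label. A minimal standalone sketch of that record layout; the record contents are made up:

record = ['i love $t$ so much\n', 'taylor swift\n', '1\n']  # hypothetical record
tw = record[0].lower().strip().replace('$t$', record[1].lower().strip())
target = '_'.join(record[1].lower().strip().split())
senti = int(record[2].strip()) + 2  # same label offset as above
print((tw, target, senti))  # -> ('i love taylor swift so much', 'taylor_swift', 3)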
Example 2
    def __iter__(self):
        # One example per line: the label comes first, the tweet text follows.
        with open(self.fname, 'rb') as f:
            for line in f:
                parts = line.strip().lower().split()
                if len(parts) >= 2:
                    y = parts[0]
                    tw = ' '.join(parts[1:])
                    x = tokenize(tw.decode('latin1'))  # decoding in Latin-1 for Arabic
                    x = u' '.join(x)
                    yield x, y
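
A standalone sketch of the per-line parsing above, runnable on its own; the line content is made up:

line = b'pos great match today\n'  # hypothetical input line: label first, text after
parts = line.strip().lower().split()
y = parts[0]
tw = b' '.join(parts[1:]).decode('latin1')
print((y, tw))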
Example 3
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            fname = os.path.join(self.dirname, fname)
            if not os.path.isfile(fname):
                continue
            for line in utils.smart_open(fname):
                # Tab-separated columns: col 1 = tweet id, col 2 = sentiment,
                # col 3 = target entity, col 4 = character offset (or 'nan'),
                # last col = the raw tweet text.
                line = line.split('\t')
                tweet_id = line[1]
                if line[2] == 'positive':
                    sent = 1
                elif line[2] == 'negative':
                    sent = -1
                else:  # 'neutral'
                    sent = 0
                senti = sent + 1
                target = line[3].lower().strip()
                location = line[4]
                tw = line[-1].lower().strip()
                tw = fix_text(tw.decode('utf-8'))
                # Collect the [start, end) span of every occurrence of the
                # target; re.escape guards against regex metacharacters.
                spans = []
                p = re.compile(r'(?<!\w)({0})(?!\w)'.format(re.escape(target)))
                for m in p.finditer(tw.lower()):
                    spans.append([m.start(), m.start() + len(m.group())])
                if location != 'nan':
                    # Pick the span whose start lies close to the annotated offset.
                    cc = 0
                    for a, b in enumerate(spans):
                        if b[0] - 1 <= int(location) <= b[1] + 4:
                            wh = a
                            cc = 1
                    if cc == 0:
                        wh = 'nan'
                else:
                    wh = location
                if wh == 'nan':
                    # No usable offset: rewrite every occurrence of the target
                    # as a single underscore-joined token.
                    tw = tw.replace(target, ' ' + target + ' ')
                    tw = tw.replace(''.join(target.split()),
                                    ' ' + '_'.join(target.split()) + ' ')
                    tw = tw.replace(target,
                                    ' ' + '_'.join(target.split()) + ' ')
                else:
                    # Rewrite only the occurrence at the selected span; the
                    # slice bounds grow to absorb the padding added at each step.
                    try:
                        r = spans[wh]
                    except IndexError:
                        print("Error at processing election data; at line 85 process_data.py!")
                        continue  # skip the malformed record
                    tw = tw[:r[0]] + tw[r[0]:r[1] + 2].replace(
                        target, ' ' + target + ' ') + tw[r[1] + 2:]
                    tw = tw[:r[0]] + tw[r[0]:r[1] + 4].replace(
                        ''.join(target.split()),
                        ' ' + '_'.join(target.split()) + ' ') + tw[r[1] + 4:]
                    tw = tw[:r[0]] + tw[r[0]:r[1] + 6].replace(
                        target,
                        ' ' + '_'.join(target.split()) + ' ') + tw[r[1] + 6:]
                tweet = tokenize(tw)
                yield (tweet, '_'.join(target.split()), senti, tweet_id, wh)
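
The offset matching above hinges on locating every occurrence of the target inside the tweet. A minimal standalone sketch of that step; the sentence and entity are made up:

import re

tw = 'i think the president, the president!, did well'
target = 'the president'
pattern = re.compile(r'(?<!\w)({0})(?!\w)'.format(re.escape(target)))
spans = [[m.start(), m.end()] for m in pattern.finditer(tw)]
print(spans)  # -> [[8, 21], [23, 36]]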
Example 4
def tok_and_replace(sen, en):
    # Lowercase, then rewrite the entity as one underscore-joined token.
    tw = sen.lower()
    target = en.lower()

    # Using already preprocessed data from Tang et al. 2016, where the
    # target is masked as '$t$'.
    tw = tw.replace('$t$', target)
    tw = tw.replace(target, ' ' + target + ' ')
    tw = tw.replace(''.join(target.split()),
                    ' ' + '_'.join(target.split()) + ' ')
    tw = tw.replace(target, ' ' + '_'.join(target.split()) + ' ')
    tweet = tokenize(tw)
    targetEn = '_'.join(target.split())
    assert targetEn in tweet
    return tweet, targetEn
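
A hedged usage example; the sentence and entity are made up, and `tokenize` is assumed to preserve underscores so the assert holds:

tweet, targetEn = tok_and_replace('I met Donald Trump today', 'Donald Trump')
# tweet now contains the single token 'donald_trump'; targetEn == 'donald_trump'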
Example 5
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            fname = os.path.join(self.dirname, fname)
            if not os.path.isfile(fname):
                continue
            # Same three-line record layout as Example 1, but the sentiment
            # label is offset by 1 instead of 2 and no '$t$' mask is expected.
            for line in utils.smart_open(fname):
                self.i += 1
                if self.i == 1:
                    tw = line.lower().strip()
                if self.i == 2:
                    target = line.lower().strip()
                if self.i == 3:
                    senti = int(line.strip()) + 1
                    tw = tw.replace(target, ' ' + target + ' ')
                    tw = tw.replace(''.join(target.split()),
                                    ' ' + '_'.join(target.split()) + ' ')
                    tw = tw.replace(target,
                                    ' ' + '_'.join(target.split()) + ' ')
                    tweet = tokenize(tw)
                    yield (tweet, '_'.join(target.split()), senti)
                    self.i = 0
Example 6
from keras.utils import to_categorical
from xml.dom import minidom

from dlblocks import text
from dlblocks.pyutils import mapArrays, loadJson, saveJson, selectKeys, oneHotVec, padList
from dlblocks.pyutils import int64Arr, floatArr

sents = {"N": -1, "P": 1, "NONE": 0}


def load_split(path):
    # Tab-separated records, one per line: the second column holds the
    # N/P/NONE sentiment code, the third the tweet text. Empty lines
    # (e.g. a trailing newline) are skipped.
    data = open(path).read().split("\n")
    data = filter(None, data)
    data = map(lambda x: x.split("\t"), data)
    data = map(
        lambda x: {
            'sentiment': sents[x[1]],
            'tokens': tokenize(x[2]),
            'text': x[2]
        }, data)
    return data


en_es_wssa_data_train = load_split(
    "./data_cm_senti/cs-corpus-with-tweets_train.txt")
en_es_wssa_data_test = load_split(
    "./data_cm_senti/cs-corpus-with-tweets_test.txt")
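
A hedged sketch of feeding the loaded records to a classifier: the -1/0/1 sentiment codes are shifted into the 0..2 range that to_categorical expects; the variable names are mine:

train = list(en_es_wssa_data_train)  # materialise, since map() is lazy on Python 3
x_train = [d['tokens'] for d in train]
y_train = to_categorical([d['sentiment'] + 1 for d in train], num_classes=3)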