Example #1
import json
import interface

def analyze(input_path, output_path):
    # Score each tweet (one JSON object per line) with the sentiment model
    # and append the enriched records to the output file.
    with open(input_path, 'r') as in_f, open(output_path, 'a') as out_f:
        for line in in_f:
            in_data = json.loads(line)
            sent = interface.predictTweet(in_data['text'])
            in_data['sentiment'] = sent['pos']
            json.dump(in_data, out_f)
            out_f.write('\n')
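A quick way to exercise analyze is to point it at a JSON-lines file of tweets. The file names below are placeholders, and the input format (one JSON object with a 'text' field per line) is inferred from the loop above:

analyze('tweets.jsonl', 'tweets_scored.jsonl')  # hypothetical paths
# Each output line is the input object plus a 'sentiment' key holding the
# 'pos' value returned by interface.predictTweet.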
Example #2
import json
import re
import sys

import pyspark_cassandra
from interface import predictTweet

def analyze(input_path, output_path):
    # Score each tweet (one JSON object per line) and append the results to the output file.
    with open(input_path, 'r') as in_f, open(output_path, 'a') as out_f:
        for line in in_f:
            in_data = json.loads(line)
            sent = predictTweet(in_data['text'])
            in_data['sentiment'] = sent['pos']
            json.dump(in_data, out_f)
            out_f.write('\n')
def clean_str(text):
    # Normalise a raw tweet: strip links, @mentions and '#' prefixes,
    # keep words and smileys, and lower-case the result.
    text = text + " "
    text = re.sub(r"http[^ ]*[\\]", "\\\\", text)  # Remove hyperlinks
    text = re.sub(r"http[^ ]* ", " ", text)        # Remove hyperlinks
    text = text.replace('\\n', ' ')                # Drop escaped newlines
    # Single punctuation marks are removed, smileys remain intact
    arr = re.findall(r"\w+(?:[-']\w+)*|'|[:)-.(]+|\S\w*", text)
    arr = [i for i in arr if len(i) > 1 and i[0] != '@']  # Remove Twitter @mentions
    arr = [i if i[0] != '#' else i[1:] for i in arr]      # Strip '#' from hashtags
    res = " ".join(arr)
    return res.lower().strip()
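For a sense of what clean_str produces, here is an illustrative call on a made-up tweet (not taken from the project's data):

clean_str("Check this out http://t.co/abc #Vote2016 @user :)")
# -> 'check this out vote2016 :)'  (link and @mention dropped, '#' stripped, smiley kept)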


# argv[1] packs the input path and the output directory as "input,output"
fp, out = sys.argv[1].split(',')

sc = pyspark_cassandra.CassandraSparkContext()

# Read the raw tweet JSON lines into 36 partitions
data = sc.textFile(fp, 36)

# Pair each parsed tweet with its cleaned text
clean_text = data.map(json.loads) \
                 .map(lambda x: (x, clean_str(x['text'])))

# Score the cleaned text and serialise each (tweet, positive score) pair as JSON
json_preds = clean_text.map(lambda x: (x[0], predictTweet(x[1])['pos'])) \
                       .map(json.dumps)

json_preds.saveAsTextFile(out)
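One detail worth noting: json.dumps turns each (tweet, score) tuple into a two-element JSON array, so the saved text files contain array lines rather than plain tweet objects. A tiny standalone check with made-up values:

import json
json.dumps(({'text': 'i love everything'}, 0.5))
# -> '[{"text": "i love everything"}, 0.5]'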
Example #4
import interface

print(interface.predictTweet("I hate you"))
print(interface.predictList(["I hate you", "I feel ambivalent about you.",
                             "Trump in General", "Violence", "I love everything"]))
Example #5
import json
import sys

import pyspark_cassandra
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

from interface import predictTweet
# clean_str (see Example #2) and db_dict (the Cassandra row builder) are
# assumed to be defined elsewhere in this project.


def save_stream(stream, table):
    # DStreams have no isEmpty(), so guard each micro-batch inside foreachRDD
    # and write the non-empty ones to the given table in keyspace 'db'.
    stream.foreachRDD(
        lambda rdd: rdd.saveToCassandra('db', table) if not rdd.isEmpty() else None)


if __name__ == '__main__':
    brokers, topic = sys.argv[1:]

    sc = pyspark_cassandra.CassandraSparkContext()
    ssc = StreamingContext(sc, 1)

    # Each Kafka message value is a JSON-encoded tweet
    kvs = KafkaUtils.createDirectStream(ssc,
                                        [topic],
                                        {"metadata.broker.list": brokers})

    clean_text = kvs.map(lambda x: json.loads(x[1])) \
                    .map(lambda x: (x, clean_str(x['text'])))

    # Build a Cassandra row from each (tweet, cleaned text) pair
    rows = clean_text.map(lambda x: db_dict(x[0], predictTweet(x[1])['pos']))

    trump = rows.filter(lambda x: x['candidate'] == 'trump')
    hillary = rows.filter(lambda x: x['candidate'] == 'hillary')
    bernie = rows.filter(lambda x: x['candidate'] == 'bernie')
    zodiac_killer = rows.filter(lambda x: x['candidate'] == 'cruz')
    parties = rows.filter(lambda x: x['candidate'] == 'parties')

    save_stream(trump, 'trump')
    save_stream(hillary, 'hillary')
    save_stream(bernie, 'bernie')
    save_stream(zodiac_killer, 'cruz')

    ssc.start()
    ssc.awaitTermination()
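db_dict itself is not shown in any of these snippets. Purely as an illustration, a row builder of roughly this shape would fit the usage above; the only field implied by the code is 'candidate', everything else (including the assumption that the incoming tweet JSON already carries a 'candidate' tag) is a guess:

def db_dict(tweet, pos_score):
    # Hypothetical sketch only: field names other than 'candidate' are guesses.
    return {
        'id': tweet.get('id_str'),
        'candidate': tweet.get('candidate'),
        'text': tweet.get('text'),
        'sentiment': pos_score,
    }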
Example #6
import json
import re
import sys

import pyspark_cassandra
from interface import predictTweet

def clean_str(text):
    # Normalise a raw tweet: strip links, @mentions and '#' prefixes,
    # keep words and smileys, and lower-case the result.
    text = text + " "
    text = re.sub(r"http[^ ]*[\\]", "\\\\", text)  # Remove hyperlinks
    text = re.sub(r"http[^ ]* ", " ", text)        # Remove hyperlinks
    text = text.replace('\\n', ' ')                # Drop escaped newlines
    # Single punctuation marks are removed, smileys remain intact
    arr = re.findall(r"\w+(?:[-']\w+)*|'|[:)-.(]+|\S\w*", text)
    arr = [i for i in arr if len(i) > 1 and i[0] != '@']  # Remove Twitter @mentions
    arr = [i if i[0] != '#' else i[1:] for i in arr]      # Strip '#' from hashtags
    res = " ".join(arr)
    return res.lower().strip()

# argv[1] packs the input path and the output directory as "input,output"
fp, out = sys.argv[1].split(',')

sc = pyspark_cassandra.CassandraSparkContext()

data = sc.textFile(fp, 36)

clean_text = data.map(json.loads) \
                 .map(lambda x: (x, clean_str(x['text'])))

json_preds = clean_text.map(lambda x: (x[0], predictTweet(x[1])['pos'])) \
                       .map(json.dumps)

json_preds.saveAsTextFile(out)