Ejemplo n.º 1
0
 def on_data(self, data) :
     """Process one raw payload and keep the module-level tallies current.

     The payload is wrapped in a ``customTweet``. If the tweet passes the
     language, term and originality checks it is counted, a progress line
     is printed, and the raw payload is handed to ``tw_writer.dump_tweet``
     tagged 'de', 'ru' or 'en'. Otherwise only a skip notice is printed.
     Every payload that does not end the run increments ``count``.

     Once the configured limits are exceeded, a summary line is printed
     and passed to ``logger.end`` and the process stops via
     ``sys.exit(0)``.

     Returns:
         True whenever the run has not been ended.
     """
     # Only the tallies are rebound in here; the limits, the writer and
     # ``init_by`` are merely read from module scope.
     global count, interesting_count, int_german, int_russian

     def _summary():
         # Final report line. The E[...] figure is the number of interesting
         # tweets that were counted as neither German nor Russian.
         remainder = interesting_count - int_german - int_russian
         return "[" + init_by + "] Successfully completed dump :: Total # : G[{}]-R[{}] | E[{}] / T[{}]".format(
             int_german, int_russian, remainder, count)

     tweet = customTweet(data)
     wanted = tweet.is_lang_interesting() and tweet.is_term_interesting() and tweet.is_original()

     if wanted:
         if tweet.is_lang_german():
             int_german += 1
         if tweet.is_lang_russian():
             int_russian += 1
         interesting_count += 1

         print("Got a new tweet :: Total # : {}-{}|{}/{}".format(
             int_german, int_russian,
             interesting_count - int_german - int_russian, count))

         # The first language check that succeeds decides the tag the raw
         # payload is stored under; at most one dump happens per tweet.
         for speaks, tag in ((tweet.is_lang_german, 'de'),
                             (tweet.is_lang_russian, 'ru'),
                             (tweet.is_lang_english, 'en')):
             if speaks():
                 tw_writer.dump_tweet(data, tag)
                 break

         under_soft_caps = interesting_count <= _INTERESTING_LIMIT and count <= _TOTAL_LIMIT
         under_strict_floor = interesting_count <= _STRICT_INTERESTING_LIMIT_LOWER
         if not (under_soft_caps or under_strict_floor):
             # Limits exceeded: report and stop. The commit of both Solr
             # cores that used to precede this shutdown is disabled.
             msg = _summary()
             print(msg)
             logger.end(msg)
             sys.exit(0)
         # While within limits nothing more happens here: the per-tweet push
         # to Solr (and the attempt to restart Solr when it was offline) is
         # disabled.
     elif not tweet.is_original():
         print("retweet/quoted tweet. Scanned[{}]".format(count))
     else:
         print("Unkown or uninteresting language/term, skipping. Scanned[{}]".format(count))

     count += 1

     # Stop once the total scanned exceeds its limit, provided the strict
     # lower bound on interesting tweets has been passed as well.
     if count > _TOTAL_LIMIT and interesting_count > _STRICT_INTERESTING_LIMIT_LOWER:
         msg = _summary()
         logger.end(msg)
         print(msg)
         sys.exit(0)

     return True
Ejemplo n.º 2
0
# Destination dump files handed to the writer (eng / ger / rus, per the file names).
to_file_path = [
    _static_path+'tweetDump/cus_eng_encoded3.json',
    _static_path+'tweetDump/cus_ger_encoded3.json',
    _static_path+'tweetDump/cus_rus_encoded3.json',
]
custom_header = '['
custom_tail = ']'

# Per-language tallies: index 0 = English, 1 = German, 2 = Russian.
count = [0, 0, 0]
# Number of input lines that raised while being converted or written.
illegal = 0

json_writer = fileWriter(to_file_path, custom_header, custom_tail)
t = 0
for fr in from_file_path:
    print("Dumping data from : " + fr)
    with open(fr, "r", encoding='utf-8') as f:
        for line in f:
            # Lines of 10 characters or fewer are ignored outright.
            if len(line) <= 10:
                continue
            try:
                # Each line is URL-unquoted (with '+' decoded) before it is
                # wrapped in a customTweet.
                tweet = customTweet(parse.unquote_plus(line))
                json_writer.dump_tweet(tweet.encode_to_json2(), tweet.lang)
                if tweet.is_lang_english():
                    count[0] += 1
                elif tweet.is_lang_german():
                    count[1] += 1
                elif tweet.is_lang_russian():
                    count[2] += 1
            except Exception as e:
                # Best effort: report the offending line's prefix, count it,
                # and carry on with the next line.
                print("failed to dump :: {} Exception : {}".format(line[:10], e))
                illegal += 1

t=0
for f in from_file_path: