def on_data(self, data):
    # tweepy stream callback: classify each incoming tweet, keep per-language
    # counts, dump interesting tweets to disk, and stop once the limits are hit.
    global count, interesting_count, int_german, int_russian, tw_writer
    global _TOTAL_LIMIT, _INTERESTING_LIMIT, _STRICT_INTERESTING_LIMIT_LOWER
    global init_by

    tweet = customTweet(data)
    if tweet.is_lang_interesting() and tweet.is_term_interesting() and tweet.is_original():
        if tweet.is_lang_german():
            int_german = int_german + 1
        if tweet.is_lang_russian():
            int_russian = int_russian + 1
        interesting_count += 1
        print("Got a new tweet :: Total # : " + str(int_german) + "-" + str(int_russian)
              + "|" + str(interesting_count - int_german - int_russian) + "/" + str(count))
        if tweet.is_lang_german():
            tw_writer.dump_tweet(data, 'de')
        elif tweet.is_lang_russian():
            tw_writer.dump_tweet(data, 'ru')
        elif tweet.is_lang_english():
            tw_writer.dump_tweet(data, 'en')
        if (interesting_count <= _INTERESTING_LIMIT and count <= _TOTAL_LIMIT) \
                or interesting_count <= _STRICT_INTERESTING_LIMIT_LOWER:
            # Solr indexing is currently disabled; only the file dump above runs.
            # try:
            #     req = requests.post(update_url[count % 2] + update_url_args[0 if count % 25 == 0 else 1],
            #                         data=tweet.encode_to_json(), headers=headers)
            # except Exception:
            #     logger.log("Solr offline. Attempting wake")
            #     p = subprocess.Popen("/home/anudeep3998/cse535/solr/solr-5.3.0/bin/solr start -e cloud -noprompt",
            #                          stdout=subprocess.PIPE, shell=True)
            #     (output, err) = p.communicate()
            #     if err:
            #         logger.log("Couldn't wake solr. Terminating.")
            #         sys.exit(0)
            #     else:
            #         logger.log("solr wake successful. Continuing..")
            pass
            # print(req.text)
            # print("Pushing to SOLR : return# " + str(req.status_code))
        else:
            # Commit both cores. One duplicate tweet will be added to one of the
            # cores, but that shouldn't matter against the overall count.
            # req = requests.post(update_url[1] + update_url_args[0], data=tweet.encode_to_json(), headers=headers)
            # req = requests.post(update_url[0] + update_url_args[0], data=tweet.encode_to_json(), headers=headers)
            msg = ("[" + init_by + "] Successfully completed dump :: Total # : G[" + str(int_german)
                   + "]-R[" + str(int_russian) + "] | E[" + str(interesting_count - int_german - int_russian)
                   + "] / T[" + str(count) + "]")
            print(msg)
            logger.end(msg)
            sys.exit(0)
    elif not tweet.is_original():
        print("retweet/quoted tweet. Scanned[" + str(count) + "]")
    else:
        print("Unknown or uninteresting language/term, skipping. Scanned[" + str(count) + "]")
    # print("Got a new tweet :: " + parsed_text['text'].encode('ascii', 'ignore').decode('ascii')
    #       + "\nTotal # : " + str(count))
    count = count + 1
    # terminate after limit
    if count > _TOTAL_LIMIT and interesting_count > _STRICT_INTERESTING_LIMIT_LOWER:
        msg = ("[" + init_by + "] Successfully completed dump :: Total # : G[" + str(int_german)
               + "]-R[" + str(int_russian) + "] | E[" + str(interesting_count - int_german - int_russian)
               + "] / T[" + str(count) + "]")
        logger.end(msg)
        print(msg)
        sys.exit(0)
    return True
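# --- Hedged usage sketch (not from the original source) ---
# on_data above matches the tweepy 3.x StreamListener callback signature. The
# commented sketch below shows how such a listener is typically attached to the
# streaming API. The class name tweetListener, the credential variables, and
# the track/language filters are placeholders, not names from this project.
#
# import tweepy
#
# auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
# auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# stream = tweepy.Stream(auth, tweetListener())  # tweetListener defines on_data above
# stream.filter(track=['refugee'], languages=['en', 'de', 'ru'])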
# Re-encode previously dumped tweets into per-language JSON array files.
to_file_path = [_static_path + 'tweetDump/cus_eng_encoded3.json',
                _static_path + 'tweetDump/cus_ger_encoded3.json',
                _static_path + 'tweetDump/cus_rus_encoded3.json']
custom_header = '['
custom_tail = ']'
count = [0, 0, 0]  # [english, german, russian]
illegal = 0
json_writer = fileWriter(to_file_path, custom_header, custom_tail)
t = 0
for fr in from_file_path:
    print("Dumping data from : " + fr)
    with open(fr, "r", encoding='utf-8') as f:
        for line in f:
            if len(line) > 10:
                try:
                    tweet = customTweet(parse.unquote_plus(str(line)))
                    json_writer.dump_tweet(tweet.encode_to_json2(), tweet.lang)
                    if tweet.is_lang_english():
                        count[0] = count[0] + 1
                    elif tweet.is_lang_german():
                        count[1] = count[1] + 1
                    elif tweet.is_lang_russian():
                        count[2] = count[2] + 1
                except Exception as e:
                    print("failed to dump :: " + line[:10] + " Exception : " + str(e))
                    illegal += 1
                    # print("Found an illegal entry")

t = 0
for f in from_file_path:
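# --- Hedged sketch of the fileWriter helper (assumption, not the original) ---
# fileWriter is used above but not defined in this section. Given the '['
# header / ']' tail and the dump_tweet(text, lang) calls, a minimal plausible
# implementation writes one JSON-array file per language, comma-separating
# entries between the header and the tail:
#
# class fileWriter:
#     _LANG_INDEX = {'en': 0, 'de': 1, 'ru': 2}
#
#     def __init__(self, paths, header, tail):
#         self.tail = tail
#         self.files = [open(p, 'w', encoding='utf-8') for p in paths]
#         self.first = [True] * len(paths)
#         for fh in self.files:
#             fh.write(header)
#
#     def dump_tweet(self, text, lang):
#         i = self._LANG_INDEX.get(lang, 0)
#         if not self.first[i]:
#             self.files[i].write(',')
#         self.files[i].write(text)
#         self.first[i] = False
#
#     def close(self):
#         for fh in self.files:
#             fh.write(self.tail)
#             fh.close()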