def bulkJsonData(json_file, _index, whatStuff):
    """Yield Elasticsearch bulk-index actions for LinkedIn contact exports.

    Args:
        json_file: path to the newline-delimited JSON source file.
        _index: target Elasticsearch index name.
        whatStuff: load-type label ("Email Addresses", "PhoneNumbers",
            "Profile_fixed", ...) selecting per-type enrichment; also stored
            in each document for later filtering.

    Yields:
        dict: bulk action with "_index", a random "_id" and a JSON "_source".
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory
        if '{"index"' not in doc:  # skip bulk-header lines
            json_doc = json.loads(doc)
            if whatStuff == "Email Addresses":
                # normalise to the common term/key search fields
                json_doc["term"] = json_doc["Email Address"]
                json_doc["key"] = "email"
            if whatStuff == "PhoneNumbers":
                json_doc["term"] = json_doc["Number"]
                json_doc["key"] = "phone"
            if whatStuff == "Profile_fixed":
                # strip special characters / emojis from the search fields
                json_doc["term"] = c.cleanText(json_doc["term"])
                json_doc["key"] = c.cleanText(json_doc["key"])
            # add load_type, used later for filter
            json_doc["load_type"] = whatStuff
            json_doc["source_type"] = "linkedIn"
            # BUG FIX: str(dict).replace("'", '"') produced invalid JSON for
            # any value containing an apostrophe or True/False/None; emit
            # real JSON instead.
            new_doc = json.dumps(json_doc)
            yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
def bulkJsonData(json_file, _index, whatStuff):
    """Yield Elasticsearch bulk-index actions for Facebook poll documents.

    Args:
        json_file: path to the newline-delimited JSON source file.
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored in each document for later filtering.

    Yields:
        dict: bulk action with "_index", a random "_id" and a JSON "_source".
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        json_doc = json.loads(doc)
        # clean the text in poll options and title from special characters and
        # emojis after json conversion; each level is guarded so a document
        # without polls no longer raises KeyError
        for attachments in json_doc.get('attachments', []):
            for dt in attachments.get('data', []):
                for pl in dt.get('poll', {}).get('options', []):
                    pl["option"] = c.cleanText(pl["option"])
                    # "voted" is a boolean in the export; stringify before cleaning
                    pl["voted"] = c.cleanText(str(pl["voted"]))
        if 'title' in json_doc:
            json_doc["title"] = c.cleanText(json_doc["title"])
        # add load_type, used later for filter
        json_doc["load_type"] = whatStuff
        json_doc["source_type"] = "facebook"
        # BUG FIX: str(dict).replace("'", '"') produced invalid JSON for any
        # value containing an apostrophe or True/False/None; emit real JSON.
        new_doc = json.dumps(json_doc)
        # use a 'yield' generator so that the data isn't loaded into memory
        if '{"index"' not in new_doc:  # skip bulk-header lines
            yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
def bulkJsonData(json_file, _index, loadType):
    """Yield Elasticsearch bulk-index actions for Facebook pages/posts files.

    Args:
        json_file: path to the newline-delimited JSON source file.
        _index: target Elasticsearch index name.
        loadType: load-type label ('pages' cleans "name", anything else cleans
            "title"); also stored in each document for later filtering.

    Yields:
        dict: bulk action with "_index", a random "_id" and a JSON "_source".
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory
        if '{"index"' not in doc:  # skip bulk-header lines
            json_doc = json.loads(doc)
            # clean the text in name/title from special characters and emojis
            # after json conversion
            field = "name" if loadType == 'pages' else "title"
            json_doc[field] = c.cleanText(json_doc[field])
            # add load_type, used later for filter
            json_doc["load_type"] = loadType
            json_doc["source_type"] = "facebook"
            # BUG FIX: str(dict).replace("'", '"') produced invalid JSON for
            # any value containing an apostrophe or True/False/None; emit
            # real JSON instead.
            new_doc = json.dumps(json_doc)
            yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
def bulkJsonData(json_file, _index, whatStuff):
    """Yield each raw document in *json_file* as an Elasticsearch bulk action.

    Args:
        json_file: path to the newline-delimited JSON source file.
        _index: target Elasticsearch index name.
        whatStuff: load-type label (unused by this variant; kept for a
            uniform signature across the loaders).

    Yields:
        dict: bulk action with "_index", a random "_id" and the raw document
        as "_source".
    """
    # a generator streams the file instead of holding it all in memory
    for doc in c.getDataFromFile(json_file):
        # bulk-header lines are not documents -- skip them
        if '{"index"' in doc:
            continue
        yield {"_index": _index, "_id": uuid.uuid4(), "_source": doc}
def bulkJsonData(json_file, _index, whatStuff):
    """Yield Elasticsearch bulk-index actions for Twitter documents.

    Args:
        json_file: path to the newline-delimited JSON source file.
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored in each document for later filtering.

    Yields:
        dict: bulk action with "_index", a random "_id" and a JSON "_source".
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory
        if '{"index"' not in doc:  # skip bulk-header lines
            json_doc = json.loads(doc)
            # add load_type, used later for filter
            json_doc["load_type"] = whatStuff
            json_doc["source_type"] = "twitter"
            # BUG FIX: str(dict).replace("'", '"') produced invalid JSON for
            # any value containing an apostrophe or True/False/None; emit
            # real JSON instead.
            new_doc = json.dumps(json_doc)
            yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
def bulkJsonData(json_file, _index, whatStuff):
    """Yield Elasticsearch bulk-index actions for Facebook comment documents,
    annotated with sentiment scores.

    Args:
        json_file: path to the newline-delimited JSON source file.
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored in each document for later filtering.

    Yields:
        dict: bulk action with "_index", a random "_id" and a JSON "_source".
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        json_doc = json.loads(doc)
        sentiment = [0, 0, 0]  # default [score, positive, negative]
        # clean the text in comments and title from special characters and
        # emojis after json conversion
        if "data" in json_doc:
            # NOTE(review): the export apparently nests a "comment" object
            # holding its own "comment" text field -- confirm against the data
            my_text_location = json_doc["data"][0]["comment"]
            my_text = my_text_location["comment"]
            # sentiment is computed on the raw (uncleaned) text
            sentiment = s.getSentiment(my_text)
            clean_my_text = c.cleanText(my_text)
            my_text_location["comment"] = clean_my_text
            json_doc["all_text"] = clean_my_text
            if "group" in my_text_location:
                my_text_location["group"] = c.cleanText(my_text_location["group"])
        json_doc["title"] = c.cleanText(json_doc["title"])
        # add sentiment
        json_doc["mySentiment"] = sentiment[0]
        json_doc["sentPositive"] = sentiment[1]
        json_doc["sentNegative"] = sentiment[2]
        # add load_type, used later for filter
        json_doc["load_type"] = whatStuff
        json_doc["source_type"] = "facebook"
        # BUG FIX: str(dict).replace("'", '"') produced invalid JSON for any
        # value containing an apostrophe or True/False/None; emit real JSON.
        new_doc = json.dumps(json_doc)
        # use a 'yield' generator so that the data isn't loaded into memory
        if '{"index"' not in new_doc:  # skip bulk-header lines
            yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
def bulkJsonData(json_file, _index, whatStuff):
    """Yield Elasticsearch bulk-index actions for LinkedIn connection exports.

    Args:
        json_file: path to the newline-delimited JSON source file.
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored in each document for later filtering.

    Yields:
        dict: bulk action with "_index", a random "_id" and a JSON "_source".
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory
        if '{"index"' not in doc:  # skip bulk-header lines
            json_doc = json.loads(doc)
            # clean each text field of special characters and emojis after
            # json conversion; guard so a record missing a column is skipped
            # rather than raising KeyError
            for field in ("First Name", "Last Name", "Company", "Position"):
                if field in json_doc:
                    json_doc[field] = c.cleanText(json_doc[field])
            # add load_type, used later for filter
            json_doc["load_type"] = whatStuff
            json_doc["source_type"] = "linkedIn"
            # BUG FIX: str(dict).replace("'", '"') produced invalid JSON for
            # any value containing an apostrophe or True/False/None; emit
            # real JSON instead.
            new_doc = json.dumps(json_doc)
            yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
def bulkJsonData(json_file, _index, whatStuff):
    """Yield Elasticsearch bulk-index actions for Facebook post documents,
    annotated with sentiment scores and a GeoJSON-style location.

    Args:
        json_file: path to the newline-delimited JSON source file.
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored in each document for later filtering.

    Yields:
        dict: bulk action with "_index", a random "_id" and a JSON "_source".
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory
        if '{"index"' in doc:  # skip bulk-header lines
            continue
        json_doc = json.loads(doc)
        sentiment = [0, 0, 0]  # default [score, positive, negative]
        # clean post text of special characters and emojis after json conversion
        for dt in json_doc.get('data', []):
            if 'post' in dt:
                my_text = dt["post"]
                # sentiment is computed on the raw (uncleaned) text
                sentiment = s.getSentiment(my_text)
                clean_my_text = c.cleanText(my_text)
                dt["post"] = clean_my_text
                json_doc["all_text"] = clean_my_text
        # clean attachment metadata and derive a [lon, lat] location
        for att in json_doc.get('attachments', []):
            for dt in att.get('data', []):
                if 'external_context' in dt and 'name' in dt["external_context"]:
                    dt["external_context"]["name"] = c.cleanText(dt["external_context"]["name"])
                if 'media' in dt:
                    dt['media']["title"] = c.cleanText(dt['media']['title'])
                    if 'description' in dt['media']:
                        dt['media']["description"] = c.cleanText(dt['media']["description"])
                if 'place' in dt:
                    my_loc = dt["place"]["coordinate"]
                    # [longitude, latitude] ordering matches GeoJSON / ES geo_point arrays
                    dt["place"]["location"] = [my_loc["longitude"], my_loc["latitude"]]
        if 'title' in json_doc:
            json_doc["title"] = c.cleanText(json_doc["title"])
        # add sentiment
        json_doc["mySentiment"] = sentiment[0]
        json_doc["sentPositive"] = sentiment[1]
        json_doc["sentNegative"] = sentiment[2]
        # add load_type, used later for filter
        json_doc["load_type"] = whatStuff
        json_doc["source_type"] = "facebook"
        # BUG FIX: str(dict).replace("'", '"') produced invalid JSON for any
        # value containing an apostrophe or Python True/False/None; json.dumps
        # emits proper true/false/null and correctly escaped strings.
        new_doc = json.dumps(json_doc)
        yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}
def bulkJsonData(json_file, _index, whatStuff):
    """Yield Elasticsearch bulk-index actions for Twitter archive tweets,
    annotated with sentiment scores.

    Args:
        json_file: path to the newline-delimited JSON source file.
        _index: target Elasticsearch index name.
        whatStuff: load-type label stored in each document for later filtering.

    Yields:
        dict: bulk action with "_index", a random "_id" and a JSON "_source".
    """
    json_list = c.getDataFromFile(json_file)
    for doc in json_list:
        # use a 'yield' generator so that the data isn't loaded into memory
        if '{"index"' in doc:  # skip bulk-header lines
            continue
        json_doc1 = json.loads(doc)
        sentiment = [0, 0, 0]  # default [score, positive, negative]
        # newer Twitter archive exports wrap each record in a "tweet" envelope
        json_doc = json_doc1.get('tweet', json_doc1)
        my_text = json_doc["full_text"]
        # sentiment is computed on the raw text; retweets are skipped
        if not my_text.startswith("RT"):
            sentiment = s.getSentiment(my_text)
        # clean text fields of special characters and emojis after json conversion
        json_doc["full_text"] = c.cleanText(my_text)
        json_doc["source"] = c.cleanText(json_doc["source"])
        if 'in_reply_to_screen_name' in json_doc:
            json_doc["in_reply_to_screen_name"] = c.cleanText(json_doc["in_reply_to_screen_name"])
        entities = json_doc.get("entities", {})
        for usr in entities.get('user_mentions', []):
            usr["name"] = c.cleanText(usr["name"])
            usr["screen_name"] = c.cleanText(usr["screen_name"])
        for url in entities.get('urls', []):
            url["url"] = c.cleanText(url["url"])
            url["expanded_url"] = c.cleanText(url["expanded_url"])
            url["display_url"] = c.cleanText(url["display_url"])
        # add sentiment
        json_doc["mySentiment"] = sentiment[0]
        json_doc["sentPositive"] = sentiment[1]
        json_doc["sentNegative"] = sentiment[2]
        # add load_type, used later for filter
        json_doc["load_type"] = whatStuff
        json_doc["source_type"] = "twitter"
        # BUG FIX: str(dict).replace("'", '"') produced invalid JSON, and the
        # old replace("False","false")/replace("True","true") hack corrupted
        # any tweet text containing those words. json.dumps emits proper
        # true/false and correctly escaped strings, so no hack is needed.
        new_doc = json.dumps(json_doc)
        yield {"_index": _index, "_id": uuid.uuid4(), "_source": new_doc}