def get_questions_detail(self): #Only collect detailed content for questions meeting certain thresholds (e.g. follower/answer counts above some value, which should not be large) #If an answer is already stored in MongoDB, author-side updates to it are ignored, because answers are usually long and re-fetching is I/O-expensive
    """Fetch, clean and persist the full content of every answer listed in
    the topics collection.

    Reads each topic document's 'aids' list (answer ids), downloads each
    answer via the Zhihu client, strips HTML, concatenates its comments,
    and inserts one document per answer into ANSWERS_COLLECTION.
    NOTE(review): assumes self.client_ is a zhihu-api client and self.db_
    a pymongo database — confirm against the enclosing class.
    """
    all_answers_id = pd.DataFrame(list(self.db_[TOPICS_COLLECTION].find()))
    for answers in all_answers_id['aids']: #extract every answer id belonging to each question
        for ans_id in answers:
            ans = self.client_.answer(ans_id)
            # Strip HTML tags from the raw answer body.
            ans_content = Cleaner.filter_tags(ans.content)
            comment_content = ""
            if (ans.comment_count > 0):
                for comment in ans.comments:
                    comment_content += Cleaner.filter_tags(
                        comment.content) + "@" #separated by '@'; later we can either score sentiment on the whole blob or per comment
            # Random pause so we don't hammer the remote API.
            time.sleep(random.randint(1, 4))
            print("ans {} done retrival and cleaning".format(ans.id))
            ans_detail = {
                'aid': ans_id,
                'votes': ans.voteup_count,
                'content': ans_content,
                'comments': comment_content,
                'author_follower_num': ans.author.follower_count
            }
            self.db_[ANSWERS_COLLECTION].insert_one(ans_detail)
def CountModel(StatusDataRdd):
    """Classify a single tweet RDD element against the life-events list and
    append the result to a parquet file.

    Takes an RDD whose first element is a tweet status dict (Twitter API
    JSON shape: 'text', 'created_at', 'id'), cleans the text, matches the
    remaining words against each configured life event, and writes one row
    per detected event (or a single "None" row) to the output parquet path.

    Fix: the trailing `HbaseSave.SaveRecord(...)` call was live code but
    referenced `HBaseInsert`, which is only assigned inside the
    commented-out HBase section below — calling it raised NameError.  It is
    now commented out with the rest of that section.
    """
    # Create or get Spark and SQL contexts, and broadcast the stop words,
    # apostrophe replacements and life-events list to the workers.
    SparkContext = ContextProvider.getSparkInstance()
    SqlContext = ContextProvider.getSQLContext()
    StopWordsList = SparkContext.broadcast(FileContentLoader.LoadStopWords())
    ApostrophesReplaceList = SparkContext.broadcast(
        FileContentLoader.LoadApostrophesReplaceWords())
    LifeEventsList = SparkContext.broadcast(FileContentLoader.LifeEventsList())

    # Clean the tweet text down to a bare list of significant words.
    StatusData = StatusDataRdd.collect()[0]
    ClanedTweetText = Cleaner.ReplaceApostrophes(
        ApostrophesReplaceList.value,
        Cleaner.RemovePunctuations(StatusData["text"]))
    BareTweetWords = Cleaner.RemoveStopWords(StopWordsList.value,
                                             ClanedTweetText).split()
    DetectedTopic = []

    # Parse the tweet date; slicing [:19] + [25:] drops the "+0000" offset
    # field from Twitter's created_at format so strptime can handle it.
    TweetDate = datetime.strptime(
        StatusData["created_at"][:19] + StatusData["created_at"][25:],
        '%a %b %d %X %Y')

    # Identify if the tweet belongs to a particular life event (match on
    # either the event name or its stemmed form).
    for Event in LifeEventsList.value["LifeEventList"]:
        if Event["Event"] in BareTweetWords or Event[
                "StemWords"] in BareTweetWords:
            DetectedTopic.append({
                "Event": Event["Event"],
                "date": TweetDate.strftime('%m/%d/%Y'),
                "Id": StatusData["id"]
            })

    # Record "None" if the tweet does not belong to any life event.
    if DetectedTopic == []:
        DetectedTopic.append({
            "Event": "None",
            "date": TweetDate.strftime('%m/%d/%Y'),
            "Id": StatusData["id"]
        })

    # Save output in a parquet file.
    TweetDataFrame = SqlContext.createDataFrame(
        SparkContext.parallelize(DetectedTopic))
    TweetDataFrame.coalesce(
        ConfigProvider.MaxPartFiles).write.mode('append').parquet(
            ConfigProvider.OutputParquetFilePath)

    # for saving to HBase
    # HBaseInsert = []
    # for Topic in DetectedTopic:
    #     HBaseInsert += [(Topic["Id"], [Topic["Id"], "cf", "Topic", Topic["topic"]])]
    #     HBaseInsert += [(Topic["Id"], [Topic["Id"], "cf", "Date", Topic["date"]])]
    # print HBaseInsert
    # HbaseSave.SaveRecord(SparkContext.parallelize(HBaseInsert), "LifeEventCount")
def __init__(self, file_name, verbose=False):
    """Load and parse a WebNLG-style XML benchmark file into self.data.

    Args:
        file_name: path to the XML file; it is first cleaned in place by
            the project's Cleaner, then parsed with xmltodict.
        verbose: if truthy, report the dirty-data and coreference counters
            at the end via show_var.

    Each benchmark entry yields (triples, text, template, ner2ent) tuples
    from extract_sentences, stored as dicts in self.data.
    NOTE(review): assumes the enclosing class also defines
    _triples_from_obj and extract_sentences — confirm.
    """
    self.cleaner = Cleaner()
    self.cleaner.clean(file_name)
    self.nlp = NLP()
    self.data = []          # accumulated per-sentence records
    self.file_name = file_name
    self.cnt_dirty_data = 0  # entries discarded as malformed
    self.cnt_corefs = 0      # sentences skipped due to coreference issues
    with open(file_name, encoding="utf-8") as f:
        content = f.read()
    try:
        structure = xmltodict.parse(content)
    except:
        # NOTE(review): debug scaffolding — a bare except that drops into
        # pdb; after the debugger exits, `structure` is undefined and the
        # loop below will raise NameError.  Consider re-raising instead.
        show_var(["file_name"])
        import pdb
        pdb.set_trace()
    for entry_ix, entry in enumerate(
            self._triples_from_obj(structure["benchmark"]["entries"],
                                   "entry")):
        self.entry_ix = entry["@eid"]
        # Modified triples: "subj | pred | obj" strings split and stripped.
        triplets = [
            tuple(map(str.strip, r.split("|")))
            for r in self._triples_from_obj(
                entry["modifiedtripleset"], "mtriple")
        ]
        # Entity map: "tag | entity" pairs.
        entitymaps = dict([
            tuple(map(str.strip, entitymap.split("|")))
            for entitymap in self._triples_from_obj(
                entry["entitymap"], "entity")
        ])
        sentences = list(self.extract_sentences(entry["lex"]))
        for s_tripleset, text, template, ner2ent in sentences:
            self.data.append({
                # 'rdfs': triplets,
                "triples": s_tripleset,
                "target": template,
                "target_txt": text,
                "ner2ent": ner2ent,
            })
    if verbose and self.cnt_dirty_data:
        show_var(["self.cnt_dirty_data"])
    if verbose and self.cnt_corefs:
        show_var(["self.cnt_corefs"])
def processTweets(RawTweet):
    # Clean one raw tweet dict (Twitter API JSON).  Strips symbols,
    # hashtags, URLs and media links from the text, rewrites user
    # mentions, then applies the project's general text cleaner.
    # Returns the mutated tweet dict, or None (implicitly) on failure.
    # NOTE(review): this is Python 2 code (print statement, str/bytes mix).
    try:
        # Drop all non-ASCII characters from the text.
        RawTweet["text"] = RawTweet["text"].encode("ascii","ignore")
        # Remove Symbols from tweet text.  Each entity span is replaced by
        # an equal-length run of spaces so that the index pairs of later
        # entities (which refer to the original text) remain valid.
        for symbol in RawTweet["entities"]["symbols"]:
            StartIndex = int(symbol["indices"][0])
            EndIndex = int(symbol["indices"][1])
            ReplaceLen = EndIndex - StartIndex
            RawTweet["text"] = RawTweet["text"][ : StartIndex ] + ReplaceLen * " " + RawTweet["text"][EndIndex : ]
        # Remove Hashtags from tweet text (same space-padding scheme).
        for hashtag in RawTweet["entities"]["hashtags"]:
            StartIndex = int(hashtag["indices"][0])
            EndIndex = int(hashtag["indices"][1])
            ReplaceLen = EndIndex - StartIndex
            RawTweet["text"] = RawTweet["text"][ : StartIndex ] + ReplaceLen * " " + RawTweet["text"][EndIndex : ]
        # Remove Url's from tweet text (same space-padding scheme).
        for url in RawTweet["entities"]["urls"]:
            StartIndex = int(url["indices"][0])
            EndIndex = int(url["indices"][1])
            ReplaceLen = EndIndex - StartIndex
            RawTweet["text"] = RawTweet["text"][ : StartIndex ] + ReplaceLen * " " + RawTweet["text"][EndIndex : ]
        # Remove media content like image links from tweet text.
        if "media" in RawTweet["entities"] and RawTweet["entities"]["media"] is not None:
            for MediaUrl in RawTweet["entities"]["media"]:
                RawTweet["text"] = RawTweet["text"].replace(MediaUrl["url"],"")
        # Replace "@screen_name:" mentions with the user's display name.
        for user in RawTweet["entities"]["user_mentions"]:
            RawTweet["text"] = RawTweet["text"].replace("@"+user["screen_name"] + ":" , " " + user["name"].encode("ascii","ignore") + " ")
        # Perform a general text cleaning, removing unwanted characters.
        RawTweet["text"] = Cleaner.TextCleaner(RawTweet["text"])
        return RawTweet
    except:
        # NOTE(review): bare except silently converts any failure into a
        # None return; callers must tolerate that.
        print "Error cleaning tweets from json file\n " , sys.exc_info()[0]
def latexerapi():
    """HTTP endpoint: OCR the image named by the `image` query parameter
    and convert the recognised text to LaTeX.

    Returns a JSON string containing the image name, a status, and the
    LaTeX result ('' with status 'failed' when OCR cannot run).
    """
    image = request.args.get("image")
    try:
        expr = pytesseract.image_to_string(
            image, lang='eng',
            output_type=pytesseract.Output.DICT)['text']
        logger.debug('Opened {}.'.format(image))
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; OCR failures (bad path, missing tesseract
        # binary) are reported to the caller as a failed status.
        logger.error(
            'Failed to open {}, make sure you\'ve installed the tesseract-ocr.'
            .format(image))
        return json.dumps({
            'image': image,
            'status': 'failed',
            'latex': ''
        }, indent=4)
    res = Latexer(Cleaner(expr))
    return json.dumps({
        'image': image,
        'status': res.status,
        'latex': str(res)
    }, indent=4)
class RDFFileReader:
    """Reader for WebNLG-style XML benchmark files.

    Parses each <entry>, cleans its triples/templates/texts, and exposes
    the per-sentence records through self.data.  Tracks counts of dirty
    entries (self.cnt_dirty_data) and coreference problems
    (self.cnt_corefs).
    NOTE(review): relies on project helpers Cleaner, NLP, show_var,
    fix_tokenize, fix_template_word, flatten_list — semantics assumed
    from usage.
    """

    def __init__(self, file_name, verbose=False):
        """Parse `file_name` (XML) into self.data; report counters if
        `verbose` is truthy."""
        self.cleaner = Cleaner()
        self.cleaner.clean(file_name)
        self.nlp = NLP()
        self.data = []           # accumulated per-sentence records
        self.file_name = file_name
        self.cnt_dirty_data = 0  # entries discarded as malformed
        self.cnt_corefs = 0      # sentences skipped for coreference issues
        with open(file_name, encoding="utf-8") as f:
            content = f.read()
        try:
            structure = xmltodict.parse(content)
        except:
            # NOTE(review): debug scaffolding — bare except dropping into
            # pdb; `structure` stays undefined afterwards.
            show_var(['file_name'])
            import pdb
            pdb.set_trace()
        for entry_ix, entry in enumerate(
                self._triples_from_obj(structure["benchmark"]["entries"],
                                       "entry")):
            self.entry_ix = entry['@eid']
            # Modified triples: "subj | pred | obj" strings, stripped.
            triplets = [
                tuple(map(str.strip, r.split("|")))
                for r in self._triples_from_obj(
                    entry["modifiedtripleset"], "mtriple")
            ]
            # Entity map: "tag | entity" pairs.
            entitymaps = dict([
                tuple(map(str.strip, entitymap.split("|")))
                for entitymap in self._triples_from_obj(
                    entry['entitymap'], 'entity')
            ])
            sentences = list(self.extract_sentences(entry["lex"]))
            for s_tripleset, text, template, ner2ent in sentences:
                self.data.append({
                    # 'rdfs': triplets,
                    'triples': s_tripleset,
                    'target': template,
                    'target_txt': text,
                    'ner2ent': ner2ent,
                })
        if verbose and self.cnt_dirty_data:
            show_var(["self.cnt_dirty_data"])
        if verbose and self.cnt_corefs:
            show_var(["self.cnt_corefs"])

    @staticmethod
    def _triples_from_obj(obj, t_name):
        """Normalise xmltodict output: return the items stored under
        `t_name` as an iterable, whether the XML held zero, one or many.

        Returns [] when `obj` is None or lacks `t_name`.  Note the
        many-items case returns a lazy map object, not a list.
        """
        def _triples_fix(triplets):
            # xmltodict collapses a single child to a bare dict; wrap it.
            if not isinstance(triplets, list):
                return [triplets]
            else:
                return map(lambda t: t, triplets)

        if not isinstance(obj, list):
            if obj is not None:
                if t_name in obj:
                    return _triples_fix(obj[t_name])
            return []
        else:
            return [_triples_fix(o[t_name]) for o in obj]

    def extract_sentences(self, lex):
        """Yield (triples, text, template, tag2ent) per usable sentence of
        an entry's <lex> element, skipping 'bad' and unfixable ones."""
        sentences = lex
        if not isinstance(sentences, list):
            sentences = [sentences]
        for s in sentences:
            # Sentences annotated '@comment' == 'bad' are skipped outright.
            if s['@comment'] == 'bad':
                continue
            template = s['template']
            text = s['text']
            # tag -> entity mapping from the <references> element.
            tag2ent = dict([
                (r['@tag'], r['@entity'])
                for r in self._triples_from_obj(s['references'], 'reference')
            ])
            # One triple list per sentence of the sorted tripleset;
            # empty sentences are filtered by the trailing `if s_triples`.
            s_tripleset_raw = [[
                tuple(map(str.strip, r.split("|")))
                for r in self._triples_from_obj(s_triples, 'striple')
            ] for s_triples in self._triples_from_obj(
                s["sortedtripleset"], 'sentence') if s_triples]
            fixed = self.fix_document(s_tripleset_raw, template, text,
                                      tag2ent)
            if fixed is None:
                continue
            s_tripleset, template, text, tag2ent = fixed
            # Split document-level template/text into per-sentence lists.
            if len(s_tripleset) == 1:
                template = [template]
                text = [text]
            else:
                template = self.nlp.sent_tokenize(template)
                text = self.nlp.sent_tokenize(text)
                text = fix_tokenize(text)
            # All three sequences must align one-to-one; otherwise the
            # entry is counted dirty and dropped.
            if len({len(template), len(text), len(s_tripleset)}) != 1:
                # import pdb;
                # pdb.set_trace()
                self.cnt_dirty_data += 1
                continue
            for s_t, tex, tem in zip(s_tripleset, text, template):
                new_s_t, tem, uniq_tag2ent = \
                    self.fix_sentence(s_t, tem, tag2ent)
                if not (new_s_t and tem and tex and uniq_tag2ent):
                    self.cnt_corefs += 1
                    # import pdb;pdb.set_trace()
                    continue
                yield new_s_t, tex, tem, uniq_tag2ent

    def fix_document(self, s_tripleset_raw, template, text, tag2ent):
        """Clean one document: normalise template words, tokenize, drop
        empty triple lists, deduplicate tags mapping to one entity, strip
        quotes, and replace '-' with '_' in entity tags.

        Returns (s_tripleset, template, text, uniq_tag2ent), or None when
        there is nothing usable (counted in self.cnt_dirty_data).
        """
        # check template: apply the project's word-level fixes.
        template = ' '.join(
            [fix_template_word[word] if word in fix_template_word else word
             for word in template.split()]) \
            if template else template
        # tokenization
        text = self.nlp.word_tokenize(text)
        template = self.nlp.word_tokenize(template)
        # clean s_tripleset: drop empty per-sentence triple lists.
        s_tripleset = [s for s in s_tripleset_raw if s]
        self.cnt_dirty_data += len(s_tripleset_raw) - len(s_tripleset)
        if (not tag2ent) or (not s_tripleset):
            self.cnt_dirty_data += not tag2ent
            return None
        # fix this case "same entity has different ners BRIDGE-1 PATIENT-1":
        # keep only the first tag of each entity and rewrite the template.
        ent2tags = defaultdict(list)
        for tag, ent in tag2ent.items():
            ent2tags[ent] += [tag]
        tag2uniq_tag = {}
        for ent, tags in ent2tags.items():
            for tag in tags:
                tag2uniq_tag[tag] = tags[0]
        uniq_tag2ent = {
            tag: ent
            for tag, ent in tag2ent.items() if tag in tag2uniq_tag.values()
        }
        for tag, uniq_tag in tag2uniq_tag.items():
            template = template.replace(tag, uniq_tag)
        assert uniq_tag2ent
        ent2uniq_tag = {v: k for k, v in uniq_tag2ent.items()}
        assert len(ent2uniq_tag) == len(uniq_tag2ent)
        # clean out extra quotes around entity names
        uniq_tag2ent = {k: v.strip('\"') for k, v in uniq_tag2ent.items()}
        try:
            s_tripleset = [[(subj.strip('\"'), predi, obj.strip('\"'))
                            for subj, predi, obj in s_triples]
                           for s_triples in s_tripleset]
        except:
            # NOTE(review): debug scaffolding — fires when a triple is not
            # exactly (subj, pred, obj).
            import pdb
            pdb.set_trace()
        # replaces '-' with '_' only in entity types
        tags = set(uniq_tag2ent.keys())
        for tag in tags:
            template = template.replace(tag, tag.replace('-', '_'))
        template = template.replace('BRIDGE-', 'BRIDGE_')
        template = template.replace('AGENT-', 'AGENT_')
        template = template.replace('PATIENT-', 'PATIENT_')
        uniq_tag2ent = {
            k.replace('-', '_'): v
            for k, v in uniq_tag2ent.items()
        }
        return s_tripleset, template, text, uniq_tag2ent

    def fix_sentence(self, s_tripleset, template, tag2ent):
        """Filter a sentence's triples down to those whose head and tail
        entities are both tagged and present in the template; inline
        untripled entities back into the template text.

        Returns (s_tripleset, template, tag2tri_ent)."""
        ent2tags = {v: k for k, v in tag2ent.items()}
        # s_tripleset must meet "head && tail are in template && tag2ent"
        bad_triples = set()
        for triple_ix, triple in enumerate(s_tripleset):
            for ent in [triple[0], triple[-1]]:
                if ent in ent2tags:
                    if ent2tags[ent] not in template:
                        bad_triples.add(triple_ix)
                        continue
                else:
                    bad_triples.add(triple_ix)
                    continue
        s_tripleset = [
            triple for triple_ix, triple in enumerate(s_tripleset)
            if triple_ix not in bad_triples
        ]
        # tag2ent are entities only in triple_entities
        triple_entities = set(
            flatten_list([(triple[0], triple[-1])
                          for triple in s_tripleset]))
        tag2tri_ent = {
            k: v
            for k, v in tag2ent.items() if v in triple_entities
        }
        # templates only have triple_entities: replace tags of entities
        # that no surviving triple mentions with the entity surface form.
        for tag, ent in tag2ent.items():
            if ent not in triple_entities:
                ent = ent.replace('_', ' ')
                template = template.replace(tag, ent)
        if {word for word in template.split()
                if 'AGENT' in word or 'BRIDGE' in word or 'PATIENT' in word} \
                != set(tag2tri_ent.keys()):
            self.cnt_corefs += 1
        assert set(tag2tri_ent.values()) == triple_entities
        ''' TODO: Erroraneous case:
        train.csv:7123:"Ayam penyet mainIngredients Squeezed"" or ""smashed"" fried chicken served with sambal",PATIENT_2 is PATIENT_3 .,"Fried chicken is Squeezed"" or ""smashed"" fried chicken served with sambal .",The chicken is smashed and served hot with sambal .,"Ayam penyet Fried chicken Squeezed"" or ""smashed"" fried chicken served with sambal",AGENT_1 PATIENT_2 PATIENT_3,ROOT mainIngredients mainIngredients_inv,mainIngredients,"[0, 2]","[2, 2, 8]","{""AGENT_1"": ""Ayam penyet"", ""PATIENT_2"": ""Fried chicken"", ""PATIENT_3"": ""Squeezed\"" or \""smashed\"" fried chicken served with sambal""}","[[0, 4], [4, 2], [2, 5], [5, 0]]","Ayam penyet <ENT_SEP> Fried chicken <ENT_SEP> Squeezed"" or ""smashed"" fried chicken served with sambal <ENT_REL_SEP> mainIngredients <REL_TRP_SEP> 0 2 0","Ayam penyet mainIngredients Squeezed"" or ""smashed"" fried chicken served with sambal <ENT_TGT_SEP> PATIENT_2 is PATIENT_3 . <TGT_TXT_SEP> The chicken is smashed and served hot with sambal ."
        train.csv:7359:Bakewell tart ingredient Frangipane,AGENT_1 contains PATIENT_3 .,Bakewell pudding contains Frangipane .,It contains frangipane .,Bakewell pudding Bakewell tart Frangipane,AGENT_1 BRIDGE_2 PATIENT_3,ROOT ingredient ingredient_inv,ingredient,"[1, 2]","[2, 2, 1]","{""AGENT_1"": ""Bakewell pudding"", ""BRIDGE_2"": ""Bakewell tart"", ""PATIENT_3"": ""Frangipane""}","[[1, 4], [4, 2], [2, 5], [5, 1]]",Bakewell pudding <ENT_SEP> Bakewell tart <ENT_SEP> Frangipane <ENT_REL_SEP> ingredient <REL_TRP_SEP> 1 2 0,Bakewell tart ingredient Frangipane <ENT_TGT_SEP> AGENT_1 contains PATIENT_3 . <TGT_TXT_SEP> It contains frangipane .
        {
            "sent": "demarce short stories in the the grantville gazettes precede eric flint novels .",
            "graph": [
                { "truth": "precededBy", "pred": "precededBy", "ent0_ent1": "1634: the bavarian crisis ENT0_END demarce short stories in the the grantville gazettes" },
                { "truth": "<unk>", "pred": "author", "ent0_ent1": "1634: the bavarian crisis ENT0_END eric flint" }
            ]
        }
        '''
        return s_tripleset, template, tag2tri_ent
#df = pd.DataFrame(topic_questions_detail,columns=['tid','qid','aids'])
# Script: connect to the local MongoDB, dump the questions_detail
# collection into a DataFrame, then fetch + print the cleaned content of
# every referenced answer.
mogo_client = MongoClient('mongodb://localhost:27017/')
db = mogo_client['test']
col = db['questions_detail']
# Fix: Collection.count() was deprecated in PyMongo 3.7 and removed in
# PyMongo 4 — count_documents({}) is the supported replacement.
print(col.count_documents({}))
#target = col.find_one({'tid' : 19575211 })
target = pd.DataFrame(list(col.find()))
#print(target.describe())
print(target)
for answers in target['aids']:
    for ans_id in answers:
        ans = client.answer(ans_id)
        print(Cleaner.filter_tags(ans.content))
        time.sleep(3)  # throttle requests to the remote API
# for q in topic.unanswered_questions:
#     if(q.follower_count > 1000):
#         #print("question {}, created at {}, has {} followers, {} answers\n".format(q.title, datetime.utcfromtimestamp(q.created_time).strftime('%Y-%m-%d %H:%M:%S'), q.follower_count, q.answer_count))
#         for ans in q.answers:
#             for com in ans.comments:
#                 print("question {} - answer {} {}- comments {}\n".format(q.id, ans.id, ans.content, com.content))
'-im', type=str, required=True, help='Input Image') parser.add_argument('--verbose', '-v', action='store_true', help='Show debugging data.') args = parser.parse_args() if not args.verbose: logger.disable('utils') logger.disable('__main__') try: expr = pytesseract.image_to_string( args.image, lang='eng', output_type=pytesseract.Output.DICT)['text'] logger.debug('Opened {}.'.format(args.image)) except: logger.error( 'Failed to open {}, make sure you\'ve installed the tesseract-ocr.' .format(args.image)) exit( 'Failed to open {}, make sure you\'ve installed the tesseract-ocr.' ) res = Latexer(Cleaner(expr)) print(res)