def data_iterator(self, _read_reviews=False, _find_words=False):
    """Yield one ``(review, qa_list, review_focus_mask)`` tuple per product.

    The file at ``self._datapath`` is read as UTF-8, one JSON object per
    line. Products whose ``'reviews'`` entry is ``None`` are skipped.

    Args:
        _read_reviews: When true, build a ``Reviews`` object from the
            product's reviews and skip products with fewer than
            ``self.num_reviews`` review documents. When false, the yielded
            ``review`` is ``None``.
        _find_words: When true, call ``extract_words`` on every kept QA and
            append the result of ``review.extract_words(...)`` to
            ``review_focus_mask``. This dereferences ``review``, so it only
            works together with ``_read_reviews=True``.

    Yields:
        ``(review, qa_list, review_focus_mask)`` where ``qa_list`` holds the
        QA objects that have at least one answer document.

    Note:
        When ``self._maxline`` is non-negative, iteration stops as soon as
        the number of products with non-null reviews reaches it; the product
        that hits the limit is not yielded. A negative value disables the
        limit.
    """
    cnt = 0
    with codecs.open(self._datapath, 'r', 'utf-8') as f:
        for line in f:
            # Lines read from a file keep their trailing newline, so the
            # stripped text must be tested to actually skip blank lines.
            if not line.strip():
                continue

            try:
                prod = json.loads(line)
            except ValueError:
                traceback.print_exc()
                pdb.set_trace()
                # Do not fall through: `prod` would be unbound or would
                # still hold the previous line's product.
                continue

            if prod['reviews'] is None:
                continue

            cnt += 1
            if 0 <= self._maxline <= cnt:
                break

            # Preprocess reviews
            review = None
            if _read_reviews:
                reviews = self._read_reviews(prod['reviews'])
                review = Reviews(self.nlp, prod['asin'], reviews,
                                 sent_split=True)
                # Filter out products with insufficient number of reviews
                if len(review.r_docs) < self.num_reviews:
                    continue

            # Preprocess question-answer
            qa_list = []
            review_focus_mask = []
            for qid, raw_qa in enumerate(prod['qa']):
                question = self._read_question(raw_qa)
                answers = self._read_answers(raw_qa)
                qa = QA(self.nlp, prod["asin"], qid, question, answers)
                if len(qa.a_docs) == 0:
                    continue
                qa_list.append(qa)

                if _find_words:
                    # Extract question focus words
                    qa.extract_words()
                    # Match QA with reviews to find out which words among
                    # the reviews should be focused on
                    review_focus_mask.append(
                        review.extract_words(qa.q_token_set, qa.a_token_set))

            yield (review, qa_list, review_focus_mask)
def merge_linked_qa(qa_pairs, link_sets):
    """Merge QA pairs whose first question belongs to the same link set.

    For every link set, the pairs whose first question id is in the set are
    combined into one new ``QA`` holding all their first questions and all
    their answers, and each of those pairs gets ``is_link = True``. A group
    is only merged when it has more than one member; a pair that is the only
    match for a link set is left untouched so it is not lost from the result.

    Args:
        qa_pairs: Objects exposing ``questions`` (non-empty; the first
            element has an ``id``), ``answers`` and ``is_link``.
        link_sets: Iterable of containers of question ids.

    Returns:
        The merged QA objects followed by every pair whose ``is_link`` is
        ``False``, in their original order.
    """
    print('\nMERGE LINKED QA')
    merged_qa_pairs = []

    for link_set in link_sets:
        members = [pair for pair in qa_pairs
                   if pair.questions[0].id in link_set]
        # A single match has nothing to merge with; keep it as a plain pair
        # instead of flagging it as linked and silently dropping it.
        if len(members) < 2:
            continue

        linked_qa = QA()
        for pair in members:
            linked_qa.questions.append(pair.questions[0])
            linked_qa.answers.extend(pair.answers)
            pair.is_link = True
        merged_qa_pairs.append(linked_qa)

    merged_qa_pairs.extend(
        pair for pair in qa_pairs if pair.is_link is False)

    print('Merged QA Pairs: %d' % len(merged_qa_pairs))
    return merged_qa_pairs
def get_qa_pairs(questions, answers, max_post_id):
    """Build a list, indexed by question id, of QA objects with their answers.

    Args:
        questions: Iterable of elements exposing ``attrib['Id']``.
        answers: Iterable of elements exposing ``attrib['ParentId']``.
        max_post_id: Largest post id; the returned list has
            ``max_post_id + 1`` slots.

    Returns:
        A list where slot ``i`` is the ``QA`` for the question with id ``i``
        (answers attached), or ``None`` when no question has that id.

    Progress counters and a final count of QA objects that have at least one
    answer are written to standard output.
    """
    print('\nGET QA PAIRS')

    print('\tCreating QA matrix...')
    qa_matrix = [None] * (max_post_id + 1)
    for i, q in enumerate(questions):
        if i > 0 and i % 1000 == 0:
            sys.stdout.write('%d ' % i)
            sys.stdout.flush()
        qa = QA()
        qa.questions.append(Question(q))
        qa_matrix[int(q.attrib['Id'])] = qa

    print('\n\tMatching questions and answers...')
    qa_matrix_len = len(qa_matrix)
    for i, a in enumerate(answers):
        if i > 0 and i % 1000 == 0:
            sys.stdout.write('%d ' % i)
            sys.stdout.flush()
        try:
            parent_id = int(a.attrib['ParentId'])
            # Reject negative ids as well: they would index the list from
            # the end and attach the answer to an unrelated question.
            if 0 <= parent_id < qa_matrix_len:
                qa = qa_matrix[parent_id]
                if qa is not None:
                    qa.answers.append(Answer(a))
        except Exception:
            # Best effort: skip answers that are malformed or cannot be
            # built, but let KeyboardInterrupt/SystemExit propagate.
            continue

    count = sum(
        1 for qa in qa_matrix if qa is not None and len(qa.answers) > 0)
    print('\nQA Pairs: %d' % count)

    return qa_matrix
# -*- coding: utf-8 -*-
"""Demo: build a QA object over ``../docs`` and print a few query results."""
from qa import QA

doc_dir = "../docs"
q = QA(doc_dir)

# Queries answered with predict().
for question in (
    "2017年“9元享看”活动开展时间?",
    "365活动线上和线下一共可以有几次权益?",
):
    res = q.predict(question, 2, 3)
    print(res)

# The same question, answered with predict_single_doc() against one named
# document.
res = q.predict_single_doc(
    "365活动线上和线下一共可以有几次权益?",
    "精彩365线上平台(餐券平台)活动知识库原文档.docx",
    3,
)
print(res)

# Tip: questions work best when they contain keywords from the document title.
def pr_family_to_googlefonts(repo_url, license, fonts, upstream_commit,
                             qa_out, html_snippet=None):
    """Package a font family, run QA on it and open a pull request.

    Steps, in order:
      1. Run the QA preflight; return early if it does not pass.
      2. Add the family to the local repo, or replace its files when the
         family already exists there.
      3. Run FontBakery, then either regression-test an updated family or
         generate screenshots for a new one.
      4. Commit and open the pull request, or log the failed tests.

    Args:
        repo_url: Passed to ``repo.commit``.
        license: License file handed to QA and added to the family.
        fonts: Font files handed to QA and added to the family.
        upstream_commit: Passed to ``repo.commit``.
        qa_out: Third argument to the ``QA`` constructor.
        html_snippet: Optional extra file added to a new family.

    Returns:
        None.
    """
    # 1 Preflight
    logger.info('Running preflight')
    qa = QA(license, fonts, qa_out)
    qa.preflight()
    if not qa.passed_preflight:
        # TODO (M Foley) Submit a git issue to the upstream repo
        logger.info('Failed preflight')
        return

    # 2 Package
    repo = GFRepo()
    family_name = get_repo_family_name(fonts)
    family_exists = repo.has_family(family_name)
    if not family_exists:
        logger.info('Family does not exist. Adding files')
        family = repo.new_family(license, family_name)
        family.add_fonts(fonts)
        family.add_file(license)
        if html_snippet:
            family.add_file(html_snippet)
        family.generate_metadata(input_designer=True, input_category=True)
    else:
        logger.info('Family already exists. Replacing files')
        family = repo.get_family(family_name)
        family.replace_fonts(fonts)
        family.replace_file(license)
        family.update_metadata()

    # 3 QA
    qa.update_paths(fonts)
    logger.info('Running fonts through FontBakery')
    qa.fontbakery()
    qa.passed = True  # When FB gets better tests, remove this.

    # 3 QA: Regression Testing
    if family.is_updated:
        if qa.passed:
            logger.info(
                'Regression testing against fonts hosted on Google Fonts')
            qa.diffbrowsers_family_update()
    elif qa.passed:
        logger.info('Generating screenshots')
        qa.diffbrowsers_new_family()
    # TODO (M Foley) improve diffenator up to R Sheeter's spec
    # logger.info('Generating before and after images')
    # qa.diffenator()

    # 4 PR
    if not qa.passed:
        failure_report = json.dumps(qa.failed_tests, indent=4)
        logger.info('QA failed. FB reported the following errors\n{}\n\n'
                    'Project state will be reset to avoid damage'.format(
                        failure_report))
    else:
        logger.info('QA passed. commiting fonts to {}'.format(
            SETTINGS['local_gf_repo_path']))
        commit_msg = repo.commit(family_name, repo_url, upstream_commit)
        # push to google/fonts. We need a bot
        logger.info('PRing fonts to {}. Be patient'.format(
            SETTINGS['local_gf_repo_path']))
        repo.pull_request(commit_msg, qa.fb_report, qa.diffbrowsers_report,
                          qa.images, qa.path, qa.gfr_url)
def get_qas(self, class_name="item-question"):
    """Collect one QA record per element matching ``class_name``.

    Elements are looked up through ``self.driver.driver``. For each one a
    new ``QA`` is filled from its child elements and appended to
    ``self.qas``. A field whose child element cannot be read is set to the
    string ``"NONE"``.

    Args:
        class_name: CSS class of the elements that each hold one question.
    """
    def text_or_none(element, css_class):
        # Best effort per field: a missing child must not abort the record.
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still stop the scrape.
        try:
            return element.find_element_by_class_name(css_class).text
        except Exception:
            return "NONE"

    # (QA attribute, CSS class of the child element holding its text)
    field_classes = (
        ("company_name", "companyName"),
        ("company_code", "company-code"),
        ("qa_time", "question-time"),
        ("question", "question-content"),
        ("answer", "reply-content"),
    )

    questions = self.driver.driver.find_elements_by_class_name(class_name)
    for question in questions:
        q_a = QA()
        for attr_name, css_class in field_classes:
            setattr(q_a, attr_name, text_or_none(question, css_class))
        self.qas.append(q_a)
def get_qas(self, class_name="m_feed_item"):
    """Collect one QA record per element matching ``class_name``.

    Elements are looked up through ``self.driver.driver``. For each one a
    new ``QA`` is filled and appended to ``self.qas``:

    * The first ``m_feed_txt`` child is split on ``)`` and ``(`` into
      company name (colons removed), company code and question text. If any
      part is missing, all three are set to ``"NONE"``.
    * The first ``m_feed_from`` child gives ``q_time``.
    * The second ``m_feed_txt`` / ``m_feed_from`` children give ``answer``
      and ``a_time``. If either is missing, both are set to ``"NONE"``.

    Args:
        class_name: CSS class of the elements that each hold one question.
    """
    questions = self.driver.driver.find_elements_by_class_name(class_name)
    for question in questions:
        q_a = QA()

        # Catch Exception rather than using a bare except throughout, so
        # that KeyboardInterrupt/SystemExit still stop the scrape.
        try:
            question_q = question.find_elements_by_class_name(
                "m_feed_txt")[0].text
            around_paren = question_q.split(')')
            name_and_code = around_paren[0].split('(')
            company_name = name_and_code[0].strip().replace(':', '')
            company_code = name_and_code[1].strip()
            question_text = around_paren[1].strip()
        except Exception:
            company_name = company_code = question_text = "NONE"
        q_a.company_name = company_name
        q_a.company_code = company_code
        q_a.question = question_text

        try:
            q_a.q_time = question.find_elements_by_class_name(
                "m_feed_from")[0].text
        except Exception:
            q_a.q_time = "NONE"

        try:
            answer = question.find_elements_by_class_name(
                "m_feed_txt")[1].text
            a_time = question.find_elements_by_class_name(
                "m_feed_from")[1].text
        except Exception:
            answer = a_time = "NONE"
        q_a.answer = answer
        q_a.a_time = a_time

        self.qas.append(q_a)