Exemple #1
0
    def data_iterator(self, _read_reviews=False, _find_words=False):
        """Stream preprocessed products from the JSON-lines file at ``self._datapath``.

        Every non-blank line is parsed as one JSON object (a "product").
        Lines that are not valid JSON are reported on stderr and skipped.
        Products whose ``'reviews'`` value is ``None`` are skipped as well.

        Args:
            _read_reviews: when true, build a ``Reviews`` object from the
                product's reviews and drop products that end up with fewer
                than ``self.num_reviews`` review documents.
            _find_words: when true, call ``extract_words()`` on every kept QA
                and collect one ``review.extract_words(...)`` result per QA.
                Requires ``_read_reviews`` to be true as well.

        Yields:
            ``(review, qa_list, review_focus_mask)`` per product, where
            ``review`` is the ``Reviews`` object (``None`` unless
            ``_read_reviews``), ``qa_list`` holds the ``QA`` objects that have
            at least one answer document, and ``review_focus_mask`` holds the
            per-QA results of ``review.extract_words`` (empty unless
            ``_find_words``).

        Raises:
            ValueError: if ``_find_words`` is requested without
                ``_read_reviews`` and a product has a QA entry to match.
        """
        cnt = 0
        with codecs.open(self._datapath, 'r', 'utf-8') as f:
            for line in f:
                # Lines read from a file keep their trailing newline, so a
                # blank line is "\n" (truthy); strip before testing.
                if not line.strip():
                    continue
                try:
                    prod = json.loads(line)
                except ValueError:
                    # Report the malformed line and move on. Falling through
                    # would reuse the previous product (or hit an unbound
                    # name on the first line).
                    traceback.print_exception(*sys.exc_info())
                    continue
                if prod['reviews'] is None:
                    continue

                # The counter is bumped before the limit check, so with a
                # non-negative ``_maxline`` at most ``_maxline - 1`` products
                # get past this point; a negative value disables the limit.
                # NOTE(review): possibly an off-by-one -- confirm intent.
                cnt += 1
                if 0 <= self._maxline <= cnt:
                    break

                # Preprocess reviews
                review = None
                if _read_reviews:
                    reviews = self._read_reviews(prod['reviews'])
                    review = Reviews(self.nlp,
                                     prod['asin'],
                                     reviews,
                                     sent_split=True)
                    if len(review.r_docs) < self.num_reviews:
                        continue  # Filter out products with insufficient number of reviews

                # Preprocess question-answer
                qa_list = []
                review_focus_mask = []
                for qid, raw_qa in enumerate(prod['qa']):
                    question = self._read_question(raw_qa)
                    answers = self._read_answers(raw_qa)
                    qa = QA(self.nlp, prod["asin"], qid, question, answers)

                    # QAs without any answer document are dropped.
                    if len(qa.a_docs) == 0:
                        continue
                    qa_list.append(qa)

                    if _find_words:
                        if review is None:
                            raise ValueError(
                                "_find_words=True requires _read_reviews=True")
                        # Extract question focus words
                        qa.extract_words()
                        # Match QA with reviews to find out which words among reviews should be focused on
                        review_focus_mask.append(
                            review.extract_words(qa.q_token_set,
                                                 qa.a_token_set))

                yield (review, qa_list, review_focus_mask)
Exemple #2
0
def merge_linked_qa(qa_pairs, link_sets):
    """Merge QA pairs whose questions are linked into combined QA objects.

    For every link set, the first question and all answers of each pair whose
    first question id is in that set are gathered into a fresh ``QA``. The
    combined object is kept only when it gathered more than one question.
    Pairs that went into a kept group are flagged ``is_link = True``.

    Args:
        qa_pairs: iterable of QA pairs; each needs ``questions`` (non-empty),
            ``answers`` and an ``is_link`` attribute.
        link_sets: iterable of containers of question ids.

    Returns:
        list: the merged groups followed by every pair whose ``is_link`` is
        ``False``, in their original order.
    """
    print('\nMERGE LINKED QA')
    merged_qa_pairs = []

    for link_set in link_sets:
        linked_qa = QA()
        members = []

        for pair in qa_pairs:
            q = pair.questions[0]
            if q.id in link_set:
                linked_qa.questions.append(q)
                linked_qa.answers.extend(pair.answers)
                members.append(pair)

        if len(linked_qa.questions) > 1:
            # Flag members only once the group is actually kept. Flagging
            # them earlier made a pair that was the sole match of its link
            # set disappear from the output altogether.
            for pair in members:
                pair.is_link = True
            merged_qa_pairs.append(linked_qa)

    for pair in qa_pairs:
        if pair.is_link is False:
            merged_qa_pairs.append(pair)

    print('Merged QA Pairs: %d' % len(merged_qa_pairs))
    return merged_qa_pairs
Exemple #3
0
def get_qa_pairs(questions, answers, max_post_id):
    """Build a table of QA objects indexed by question post id.

    Args:
        questions: iterable of elements exposing ``attrib['Id']``.
        answers: iterable of elements exposing ``attrib['ParentId']``.
        max_post_id: largest post id expected; the table has
            ``max_post_id + 1`` slots.

    Returns:
        list: slot ``i`` holds the ``QA`` for the question with id ``i`` (with
        its matched answers appended), or ``None`` when no such question was
        seen.

    Raises:
        IndexError: if a question id exceeds ``max_post_id``.
    """
    print('\nGET QA PAIRS')

    print('\tCreating QA matrix...')
    qa_matrix = [None] * (max_post_id + 1)
    for i, q in enumerate(questions):
        # Progress counter every 1000 items, kept on a single line.
        if i > 0 and i % 1000 == 0:
            sys.stdout.write('%d ' % i)
            sys.stdout.flush()

        qa = QA()
        qa.questions.append(Question(q))
        qa_matrix[int(q.attrib['Id'])] = qa

    print('\n\tMatching questions and answers...')
    qa_matrix_len = len(qa_matrix)
    for i, a in enumerate(answers):
        if i > 0 and i % 1000 == 0:
            sys.stdout.write('%d ' % i)
            sys.stdout.flush()

        try:
            parent_id = int(a.attrib['ParentId'])
            # The lower bound matters: a negative id would otherwise index
            # from the end of the list and attach the answer to an unrelated
            # question.
            if 0 <= parent_id < qa_matrix_len:
                qa = qa_matrix[parent_id]
                if qa is not None:
                    qa.answers.append(Answer(a))
        except Exception:
            # Best effort: answers with a missing or malformed ParentId (or
            # that cannot be wrapped) are skipped. KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            continue

    count = sum(1 for qa in qa_matrix
                if qa is not None and len(qa.answers) > 0)

    print('\nQA Pairs: %d' % count)
    return qa_matrix
Exemple #4
0
# -*- coding: utf-8 -*-

from qa import QA

# Directory handed to the QA constructor.
doc_dir = "../docs"
q = QA(doc_dir)

# This question is asked twice: once via predict() and once against a
# single named document.
benefits_question = "365活动线上和线下一共可以有几次权益?"

# Queries through predict(); the meaning of the trailing (2, 3) arguments is
# defined by QA.predict and is not visible from this script.
for question in ("2017年“9元享看”活动开展时间?", benefits_question):
    res = q.predict(question, 2, 3)
    print(res)

# Same question, restricted to one document.
res = q.predict_single_doc(
    benefits_question,
    "精彩365线上平台(餐券平台)活动知识库原文档.docx",
    3)
print(res)

# Tip: questions work best when they contain keywords from the document title.
Exemple #5
0
def pr_family_to_googlefonts(repo_url,
                             license,
                             fonts,
                             upstream_commit,
                             qa_out,
                             html_snippet=None):
    """Send a family pull request to a google/fonts repo.

    Steps, in order: run the QA preflight (returning early when it fails),
    add the family to the local repo or replace its files if it already
    exists, run FontBakery, run the browser diff step that matches the
    family's update state, then commit and open the pull request.

    Args:
        repo_url: passed to ``repo.commit``.
        license: license file; passed to ``QA`` and added to / replaced in
            the family.
        fonts: font files; passed to ``QA`` and added to / replaced in the
            family.
        upstream_commit: passed to ``repo.commit``.
        qa_out: passed to ``QA``.
        html_snippet: optional extra file, added only for a new family.

    Returns:
        None.
    """
    # 1 Preflight
    logger.info('Running preflight')
    qa = QA(license, fonts, qa_out)
    qa.preflight()
    if not qa.passed_preflight:
        # TODO (M Foley) Submit a git issue to the upstream repo
        logger.info('Failed preflight')
        return

    # 2 Package
    repo = GFRepo()
    family_name = get_repo_family_name(fonts)

    if not repo.has_family(family_name):
        logger.info('Family does not exist. Adding files')
        family = repo.new_family(license, family_name)
        family.add_fonts(fonts)
        family.add_file(license)
        if html_snippet:
            family.add_file(html_snippet)
        family.generate_metadata(input_designer=True, input_category=True)
    else:
        logger.info('Family already exists. Replacing files')
        family = repo.get_family(family_name)
        family.replace_fonts(fonts)
        family.replace_file(license)
        family.update_metadata()

    # 3 QA
    qa.update_paths(fonts)
    logger.info('Running fonts through FontBakery')
    qa.fontbakery()

    qa.passed = True  # When FB gets better tests, remove this.

    if not qa.passed:
        failure_report = json.dumps(qa.failed_tests, indent=4)
        logger.info('QA failed. FB reported the following errors\n{}\n\n'
                    'Project state will be reset to avoid damage'.format(
                        failure_report))
        return

    # 3 QA: Regression Testing
    if family.is_updated:
        logger.info('Regression testing against fonts hosted on Google Fonts')
        qa.diffbrowsers_family_update()
    else:
        logger.info('Generating screenshots')
        qa.diffbrowsers_new_family()
        # TODO (M Foley) improve diffenator up to R Sheeter's spec
        # logger.info('Generating before and after images')
        # qa.diffenator()

    # 4 PR
    local_repo_path = SETTINGS['local_gf_repo_path']
    logger.info('QA passed. commiting fonts to {}'.format(local_repo_path))
    commit_msg = repo.commit(family_name, repo_url, upstream_commit)
    # push to google/fonts. We need a bot
    logger.info('PRing fonts to {}. Be patient'.format(local_repo_path))
    repo.pull_request(commit_msg, qa.fb_report, qa.diffbrowsers_report,
                      qa.images, qa.path, qa.gfr_url)
Exemple #6
0
    def get_qas(self, class_name="item-question"):
        """Scrape every question element on the page into ``self.qas``.

        For each element found by ``class_name`` a ``QA`` is created and five
        text fields are filled from child elements. A field whose child
        element cannot be read is set to the string ``"NONE"``.

        Args:
            class_name: CSS class identifying one question element.
        """
        # (QA attribute, CSS class of the child element holding its text)
        field_classes = (
            ("company_name", "companyName"),
            ("company_code", "company-code"),
            ("qa_time", "question-time"),
            ("question", "question-content"),
            ("answer", "reply-content"),
        )

        questions = self.driver.driver.find_elements_by_class_name(class_name)

        for question in questions:
            q_a = QA()

            for attr, child_class in field_classes:
                try:
                    text = question.find_element_by_class_name(
                        child_class).text
                except Exception:
                    # Best effort per field. Unlike a bare ``except:`` this
                    # does not swallow KeyboardInterrupt or SystemExit.
                    text = "NONE"
                setattr(q_a, attr, text)

            self.qas.append(q_a)
Exemple #7
0
    def get_qas(self, class_name="m_feed_item"):
        """Scrape every feed item on the page into ``self.qas``.

        Each item is expected to hold up to two ``m_feed_txt`` elements
        (question, then answer) and up to two ``m_feed_from`` elements (their
        timestamps). The question text is split at its first ``)``: the part
        before it supplies ``company_name`` (text before ``(``, colons
        removed) and ``company_code`` (text after ``(``), the remainder is
        the question. Any group of fields that cannot be read is set to the
        string ``"NONE"``.

        Args:
            class_name: CSS class identifying one feed item.
        """
        questions = self.driver.driver.find_elements_by_class_name(class_name)

        for question in questions:
            q_a = QA()

            # Look each element list up once; a failed lookup behaves like an
            # empty list so the field groups below fall back to "NONE".
            try:
                texts = question.find_elements_by_class_name("m_feed_txt")
            except Exception:
                texts = []
            try:
                stamps = question.find_elements_by_class_name("m_feed_from")
            except Exception:
                stamps = []

            try:
                question_q = texts[0].text
                # Split on the FIRST ')' only, so a question that itself
                # contains ')' is not truncated.
                header, body = question_q.split(')', 1)
                header_parts = header.split('(')
                company_name = header_parts[0].strip().replace(':', '')
                company_code = header_parts[1].strip()
                q_a.company_name = company_name
                q_a.company_code = company_code
                q_a.question = body.strip()
            except Exception:
                q_a.company_name = "NONE"
                q_a.company_code = "NONE"
                q_a.question = "NONE"

            try:
                q_a.q_time = stamps[0].text
            except Exception:
                q_a.q_time = "NONE"

            # Answer and its timestamp are all-or-nothing.
            try:
                q_a.answer = texts[1].text
                q_a.a_time = stamps[1].text
            except Exception:
                q_a.answer = "NONE"
                q_a.a_time = "NONE"

            self.qas.append(q_a)