Example #1
def convertToCSV(article):
    print(article["Title"])
    # Convert the article content to plain text, then normalize it to simplified Chinese.
    article_cn = text_maker.handle(article["Content"])
    article_raw = chinese_nlp.convert2simplified(article_cn)
    # Find company mentions in the original content.
    matched_words = chinese_tagging_api.matching(article["Content"], companies)
    article_en = article_raw
    # Replace every matched Chinese alias with the company's English name so the
    # entities survive machine translation intact.
    for word in matched_words:
        for tag in word['Matched']:
            article_en = article_en.replace(tag, " " + word["Name_EN"] + " ")

    try:
        article_en = translator.translate(article_en).text
    except Exception:
        article_en = ""
        print("Article cannot be translated!")

    if article_en:
        article_en = article_en.replace("., ", ".,")
        for word in matched_words:
            word['Name_EN'] = word['Name_EN'].replace('., ', '.,')
            for tag in word['Matched']:
                if word['Name_EN'] in article_en:
                    # Count occurrences of the simplified tag in the simplified
                    # article and normalize the count into a score.
                    score = sum(1 for _ in
                                re.finditer(chinese_nlp.convert2simplified(tag), article_raw)) / float(
                        len(article))
                    writer1.writerow(
                        [article["Article_ID"].replace(":", ""), word['PermID'], word['RIC'], tag, score, word['Name'],
                         word['Name_EN']])
                else:
                    print("Translation Error!")

        writer2.writerow([article["Article_ID"].replace(":", ""), article_cn, article_en, article["Title"], article["Source"],
                          datetime.strptime(article["Time"], "%Y-%m-%dT%H:%M:%S").strftime('%d %b %Y %I:%M%p'), "https://www.thomsonreuters.com/en.html"])
    else:
        print("Jump")  # article could not be translated; skip it
Example #2
def load_company_names(file, sheet, exclude_alias):
    workbook = px.load_workbook(file)
    sheet = workbook.get_sheet_by_name(name=sheet)
    # Aliases to ignore: the first token of each line, converted to simplified Chinese.
    with open(exclude_alias, encoding="utf-8") as f:
        exclude = set(
            [chinese_nlp.convert2simplified(l.split()[0]) for l in iter(f)])
    # Read every row of the sheet into a plain list of cell values.
    companies = []
    for row in sheet.iter_rows():
        row1 = []
        for cell in row:
            row1.append(cell.internal_value)
        # row2 = [x for x in row1 if not pd.isnull(x)]
        row2 = row1
        companies.append(row2)
    companies = companies[1:]  # drop the header row
    # Index the companies by PermID and build a simplified-Chinese alias lexicon for each.
    companies1 = {}
    for company in companies:
        key = str(company[0])
        companies1[key] = {}
        companies1[key]['PermID'] = company[0]
        companies1[key]['RIC'] = company[1]
        companies1[key]['Name'] = get_company_name(company)
        if company[3]:
            companies1[key]['Name_EN'] = company[3]
        else:
            companies1[key]['Name_EN'] = company[2]

        companies1[key]['lexicon'] = set(company[2:])
        companies1[key]['lexicon_simp'] = set([
            chinese_nlp.convert2simplified(x)
            for x in companies1[key]['lexicon']
        ]) - exclude
    return companies1
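
A possible call site for the loader above. The column layout (PermID, RIC, then name and alias columns, with the fourth column apparently holding the English name) is inferred from the indices used in the code, and the file and sheet names here are hypothetical:

companies = load_company_names(
    file="companies.xlsx",                # hypothetical workbook path
    sheet="Sheet1",                       # hypothetical sheet name
    exclude_alias="excluded_aliases.txt"  # one excluded alias per line
)
first_key = next(iter(companies))
print(companies[first_key]['Name_EN'], companies[first_key]['lexicon_simp'])
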
Example #3
def matching(text, companies):
    input_text = dict()
    input_text['text'] = text
    input_text['text_simp'] = chinese_nlp.convert2simplified(
        input_text['text'])
    matched = []
    # Segment the simplified text; the resulting tokens are compared against each
    # company's simplified alias lexicon via set intersection.
    w = chinese_nlp.pseg_2gram(input_text['text_simp'])
    words = set(w['words'])
    # words1 = Counter(w['words'])
    for k in companies:
        matched1 = words & companies[k]['lexicon_simp']
        if len(matched1) > 0:
            # companies[k]["Matched"] = ",".join(matched1)
            companies[k]["Matched"] = matched1
            matched.append(companies[k])
    # matched_occur = {}
    # for id in matched:
    #     matched_occur[id] = []
    #     for word in matched[id]:
    #         matched_occur[id] += [(x.start(), x.end()) for x in re.finditer(word, input_text['text_simp'])]
    # input_text['matched'] = matched
    # input_text['matched_occur'] = matched_occur
    return matched
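
Tying the previous two examples together, a sketch of how matching might be called once the alias lexicon has been built; the input text is only a placeholder:

companies = load_company_names("companies.xlsx", "Sheet1", "excluded_aliases.txt")

text = "..."  # traditional or simplified Chinese article text
for company in matching(text, companies):
    # Each hit keeps the spreadsheet fields plus the set of aliases that matched.
    print(company['PermID'], company['RIC'], company['Name_EN'], company['Matched'])
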
Example #4
                # except:
                #     title = ""
                # try:
                #     body = soup.find('body').text
                #     if "Reuters" in body:
                #         source = "Reuters"
                #     else:
                #         source = "Not Reuters"
                # except:
                #     body = ""
                #     source = "Not Reuters"
                # article = html2text.html2text(body)
                # if article.startswith("### "):
                #     article = article.replace('### ', "")
                #
                # Normalize the scraped article to simplified Chinese and tag company mentions.
                article_sim = chinese_nlp.convert2simplified(article)
                article_en = article
                matched_words = chinese_tagging_api.matching(
                    article, companies)

                # Swap each matched Chinese alias for the company's English name
                # before sending the text to the translator.
                for word in matched_words:
                    for tag in word['Matched']:
                        article_en = article_en.replace(
                            tag, " " + word["Name_EN"] + " ")

                try:
                    article_en = translator.translate(article_en).text
                except Exception:
                    article_en = ""
                    print("Error!")