Exemple #1
0
def test_double_cleanups():
    """Apply ``basename`` twice to each multi-cleanup variation.

    Every entry of ``multi_cleanup_tests`` must come out as "Hello World"
    after two passes with prefix, suffix and middle stripping enabled.
    """
    target = "Hello World"
    failure = "cleanup of %s failed"
    strip_everything = dict(prefix=True, suffix=True, middle=True)
    for label, raw_name in multi_cleanup_tests.items():
        first_pass = basename(raw_name, **strip_everything)
        second_pass = basename(first_pass, **strip_everything)

        assert second_pass == target, failure % label
def get_ticker_symbol(security, securities):
    """Resolve company names to ticker symbols via the MarketWatch lookup page.

    The training examples do not carry ticker symbols, so they are scraped
    from the MarketWatch quote-lookup tool.

    Args:
        security: Name of the primary company.
        securities: List of further company names. Entries that resolve are
            overwritten in place with their ticker symbol, as the previous
            implementation did.

    Returns:
        A tuple ``(security, correct_securities)``. ``security`` is the
        ticker found for the primary company, or the input value unchanged
        when the lookup produced nothing usable. ``correct_securities``
        holds the tickers of those ``securities`` entries that resolved,
        in their original order; entries whose lookup failed or whose
        result did not mention the company name are dropped.
    """
    # The primary security is accepted without the name check, matching the
    # historical behaviour of this function.
    primary_ticker = _lookup_ticker(security, require_name_match=False)
    if primary_ticker is not None:
        security = primary_ticker

    correct_securities = []
    for i, name in enumerate(securities):
        ticker = _lookup_ticker(name, require_name_match=True)
        if ticker is None:
            # Wrong or missing ticker symbol: leave this company out.
            continue
        securities[i] = ticker
        correct_securities.append(ticker)

    return security, correct_securities


def _lookup_ticker(name, require_name_match):
    """Return the ticker scraped for *name*, or None when no usable result exists."""
    # Remove things like Inc., Ltd. from the company name.
    sec = basename(
        name, terms, prefix=False, middle=False, suffix=True).lower()

    if sec == 'google':
        # Hardcoded because Google is listed under its parent company
        # Alphabet, yet a lot of people still refer to it as Google.
        return 'GOOG'

    # Pass the query through ``params`` so that names containing characters
    # such as '&' or '#' are URL-encoded instead of corrupting the query.
    page = requests.get(
        'https://www.marketwatch.com/tools/quotes/lookup.asp',
        params={
            'siteID': 'mktw',
            'Lookup': sec,
            'Country': 'all',
            'Type': 'All',
        })
    soup = BeautifulSoup(page.content, 'html.parser')
    # All the results are in the "results" div.
    results = soup.find_all('div', class_='results')
    if not results:
        return None

    tokens = results[0].text.split()
    # The ticker is read from the fourth whitespace-separated token;
    # presumably the first three are header text -- TODO confirm against the
    # live page. Guard against shorter output instead of raising IndexError.
    if len(tokens) <= 3:
        return None
    # Optionally check that the result actually mentions the company name.
    if require_name_match and not any(sec in token.lower() for token in tokens):
        return None
    return tokens[3]
Exemple #3
0
 def create_author(authors):
     """Reduce each author name to the first word of its cleaned base name.

     Each name is lower-cased and stripped, then passed through ``basename``
     twice so that stacked endings (i.e. "Co., Ltd.") are both removed.
     Only the text before the first space of the result is kept.
     """
     terms = prepare_terms()

     def strip_terms(name):
         return basename(name, terms, prefix=True, middle=True, suffix=True)

     # First pass over the normalised names.
     once_cleaned = [strip_terms(raw.lower().strip()) for raw in authors]
     # Second pass removes a remaining ending; keep the leading word only.
     return [strip_terms(name).partition(' ')[0] for name in once_cleaned]
Exemple #4
0
def test_multi_type_cleanups(terms):
    """Strip prefix, suffix and middle terms in a single ``basename`` pass.

    Every entry of ``multi_cleanup_tests`` must come out as "Hello World".
    """
    target = "Hello World"
    failure = "cleanup of %s failed"
    for label, raw_name in multi_cleanup_tests.items():
        cleaned = basename(
            raw_name, terms, prefix=True, suffix=True, middle=True)
        assert cleaned == target, failure % label
Exemple #5
0
_PATENT_DATA_TYPES = ('applications', 'grants')


def save_data(dataset, data_type, source_type):
    """Look up patent counts for each named row and persist them.

    For every row of ``dataset`` with a non-empty ``'name'``, the name is
    cleaned with ``basename`` and looked up with ``find_applications`` or
    ``find_grants``. One output record is produced per entry of
    ``row['gans']`` and per returned patent row whose ``'count'`` is
    positive. The records are printed, written to
    ``{source_type}_{data_type}.csv`` and stored with ``to_sql`` under the
    same name, replacing an existing table.

    Args:
        dataset: Iterable of mappings with ``'name'`` and ``'gans'`` keys.
        data_type: Either ``'applications'`` or ``'grants'``; also used as
            the PostgreSQL search path for the lookups.
        source_type: Prefix for the CSV file name and the table name.

    Raises:
        ValueError: If ``data_type`` is not a supported value.
        Exception: Any database error is logged and re-raised.

    Note:
        When nothing matches, an empty CSV and an empty table (with the
        usual columns) are still written; previously this case failed with
        a ``KeyError`` during the integer conversion.
    """
    # Validate first: data_type selects the lookup function and is
    # interpolated into SQL below, so only known values may pass.
    if data_type not in _PATENT_DATA_TYPES:
        raise ValueError(
            f"unsupported data_type {data_type!r}; "
            f"expected one of {_PATENT_DATA_TYPES}")
    find_patents = (find_applications
                    if data_type == 'applications' else find_grants)

    cnx = None
    try:
        cnx = psycopg2.connect(**pg_config_patents)
        with cnx.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(f"SET SEARCH_PATH = {data_type}")
        cnx.commit()
        records = _collect_patent_records(dataset, find_patents, cnx)
    except Exception as err:
        # Do not continue without a usable connection; surface the failure.
        logger.error(err)
        raise
    finally:
        # Always release the connection, including on errors.
        if cnx is not None:
            cnx.close()

    if not records:
        logger.warning("no patent records found for %s/%s", source_type,
                       data_type)

    # Explicit columns keep the frame well-formed even without records.
    df_dataset = pd.DataFrame(
        records,
        columns=['gans', 'participant_name', 'organization', 'year', 'count'])
    cols = ['gans', 'year', 'count']
    df_dataset[cols] = df_dataset[cols].astype(int)
    print(df_dataset)

    dataname = f"{source_type}_{data_type}"
    filename = f"{dataname}.csv"
    df_dataset.to_csv(filename, index=False)
    try:
        engine = create_engine(
            f"postgresql://{DB_USER_IMI}:{DB_PASS_IMI}@{DB_HOST_IMI}:5432/{DB_NAME_IMI}"
        )
        df_dataset.to_sql(dataname,
                          con=engine,
                          schema=DB_SCHEMA_IMI,
                          if_exists='replace',
                          index=False)
    except Exception as err:
        logger.error(err)
        raise


def _collect_patent_records(dataset, find_patents, cnx):
    """Return one record per (gan, patent row) pair for every named dataset row."""
    records = []
    for row in dataset:
        if not row['name']:
            # Rows without a name cannot be looked up.
            continue
        name = basename(row['name'],
                        terms,
                        prefix=False,
                        middle=False,
                        suffix=True)
        for patent in find_patents(name, cnx):
            if patent['count'] > 0:
                for gan in row['gans']:
                    records.append({
                        'gans': gan,
                        'participant_name': row['name'],
                        'organization': patent['organization'],
                        'year': patent['year'],
                        'count': patent['count'],
                    })
    return records
Exemple #6
0
def test_preserving_cleanups(terms):
    """Check each ``preserving_cleanup_tests`` case with default ``basename`` options.

    Each case maps a test name to a ``(variation, expected)`` pair.
    """
    failure = "preserving cleanup of %s failed"
    for label, case in preserving_cleanup_tests.items():
        raw_name, wanted = case
        cleaned = basename(raw_name, terms)
        assert cleaned == wanted, failure % label
Exemple #7
0
def test_basic_cleanups(terms):
    """Every ``basic_cleanup_tests`` variation must clean up to "Hello World".

    ``basename`` is called with its default options.
    """
    target = "Hello World"
    failure = "cleanup of %s failed"
    for label, raw_name in basic_cleanup_tests.items():
        cleaned = basename(raw_name, terms)
        assert cleaned == target, failure % label
Exemple #8
0
def test_with_unicode_umlauted_name(terms):
    """Check each ``unicode_umlaut_tests`` case with prefix stripping enabled.

    Each case maps a test name to a ``(variation, expected)`` pair.
    """
    failure = "preserving cleanup of %s failed"
    for label, case in unicode_umlaut_tests.items():
        raw_name, wanted = case
        cleaned = basename(raw_name, terms, prefix=True)
        assert cleaned == wanted, failure % label
Exemple #9
0
def test_terms_with_accents(terms):
    """Check each ``terms_with_accents_tests`` case with suffix stripping enabled.

    Each case maps a test name to a ``(variation, expected)`` pair.
    """
    failure = "preserving cleanup of %s failed"
    for label, case in terms_with_accents_tests.items():
        raw_name, wanted = case
        cleaned = basename(raw_name, terms, suffix=True)
        assert cleaned == wanted, failure % label