def test_double_cleanups(terms):
    """Cleanup must be idempotent: applying basename twice yields the
    same canonical result as applying it once.

    Fix: the original called ``basename`` without the ``terms`` argument
    that every other test in this module supplies via the ``terms``
    fixture.
    """
    expected = "Hello World"
    errmsg = "cleanup of %s failed"
    for testname, variation in multi_cleanup_tests.items():
        # First pass strips prefix/suffix/middle terms; the second pass
        # must leave the already-clean name untouched.
        result = basename(variation, terms, prefix=True, suffix=True, middle=True)
        final = basename(result, terms, prefix=True, suffix=True, middle=True)
        assert final == expected, errmsg % testname
def _scrape_ticker_tokens(sec):
    """Scrape marketwatch.com's symbol-lookup page for *sec*.

    Returns the whitespace-split tokens of the first ``results`` div
    (token index 3 is the ticker symbol in the page layout), or ``None``
    when the page has no results.
    """
    page = requests.get(
        'https://www.marketwatch.com/tools/quotes/lookup.asp?siteID=mktw&Lookup='
        + str(sec) + '&Country=all&Type=All')
    soup = BeautifulSoup(page.content, 'html.parser')
    # All hits are rendered inside a single <div class="results">.
    results = soup.find_all('div', class_='results')
    if not results:
        return None
    tokens = results[0].text.strip().replace('\n', ' ').split()
    return tokens or None


def get_ticker_symbol(security, securities):
    """Return ticker symbols for *security* and each entry of *securities*.

    The training examples do not carry ticker symbols, so they are
    scraped from the MarketWatch lookup page.  Entries of *securities*
    whose lookup fails (or whose result does not match the cleaned
    company name) are dropped from the returned list.  Note that
    *securities* is also updated in place with the resolved symbols,
    matching the original behaviour.
    """
    # Strip corporate suffixes (Inc., Ltd., ...) before looking up.
    sec = basename(security, terms, prefix=False, middle=False, suffix=True).lower()
    if sec == 'google':
        # Hardcoded: Google is listed under its parent company Alphabet,
        # yet many people still refer to it as Google.
        security = 'GOOG'
    else:
        tokens = _scrape_ticker_tokens(sec)
        if tokens:
            security = tokens[3]

    remove = set()  # indices whose ticker lookup failed or mismatched
    for i in range(len(securities)):
        sec = basename(securities[i], terms,
                       prefix=False, middle=False, suffix=True).lower()
        if sec == 'google':
            securities[i] = 'GOOG'
            continue
        tokens = _scrape_ticker_tokens(sec)
        # Second condition checks that the result really is the company
        # we asked for before trusting the scraped symbol.
        if tokens and any(sec in s.lower() for s in tokens):
            securities[i] = tokens[3]
        else:
            remove.add(i)

    # Keep only companies whose ticker symbol resolved correctly.
    correct_securities = [s for i, s in enumerate(securities) if i not in remove]
    return security, correct_securities
def create_author(authors):
    """Normalise a list of author/company names.

    Each name is lower-cased, stripped, run through ``basename`` twice
    (a single pass can leave a second trailing term, e.g. "Co., Ltd."),
    and finally reduced to its first word.
    """
    terms = prepare_terms()
    normalised = []
    for raw in authors:
        first_pass = basename(raw.lower().strip(), terms,
                              prefix=True, middle=True, suffix=True)
        second_pass = basename(first_pass, terms,
                               prefix=True, middle=True, suffix=True)
        normalised.append(second_pass.partition(' ')[0])
    return normalised
def test_multi_type_cleanups(terms):
    """Mixed prefix/suffix/middle terms are all stripped in one pass."""
    for name, raw in multi_cleanup_tests.items():
        cleaned = basename(raw, terms, prefix=True, suffix=True, middle=True)
        assert cleaned == "Hello World", "cleanup of %s failed" % name
def save_data(dataset, data_type, source_type):
    """Match dataset participants against patent records and persist them.

    For each named participant in *dataset*, looks up patent
    ``applications`` or ``grants`` by the cleaned organisation name,
    collects one row per (patent, gan) with the yearly count, writes the
    result to ``<source_type>_<data_type>.csv`` and uploads it to the
    IMI Postgres schema.

    Parameters
    ----------
    dataset : iterable of dict
        Rows carrying at least ``'name'`` and ``'gans'`` keys.
    data_type : str
        ``'applications'`` or ``'grants'``; also used as the Postgres
        search path and the output-name suffix.
    source_type : str
        Prefix for the output dataset/file name.

    Raises
    ------
    Exception
        Re-raised when the patent DB connection or the final upload fails.
    """
    df_dataset = pd.DataFrame()
    i = 0
    try:
        cnx = psycopg2.connect(**pg_config_patents)
        cur = cnx.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        # NOTE(review): data_type is interpolated into SQL; it is expected
        # to be one of the two internal constants, never user input.
        cur.execute(f"SET SEARCH_PATH = {data_type}")
        cnx.commit()
    except Exception as err:
        logger.error(err)
        # Fix: without a connection everything below would crash later
        # with an unbound `cnx`; fail fast instead of logging and going on.
        raise
    for row in dataset:
        if row['name'] is None or len(row['name']) == 0:
            continue
        # Strip corporate suffixes (Inc., Ltd., ...) before matching.
        name = basename(row['name'], terms, prefix=False, middle=False, suffix=True)
        patentdata = []  # fix: stays empty for unrecognised data_type values
        if data_type == 'applications':
            patentdata = find_applications(name, cnx)
        elif data_type == 'grants':
            patentdata = find_grants(name, cnx)
        if len(patentdata) > 0:
            for patent in patentdata:
                if patent['count'] > 0:
                    # One output row per (patent, gan) combination.
                    for gun in row['gans']:
                        df_dataset.at[i, "gans"] = gun
                        df_dataset.at[i, "participant_name"] = row['name']
                        df_dataset.at[i, "organization"] = patent['organization']
                        df_dataset.at[i, "year"] = patent['year']
                        df_dataset.at[i, "count"] = patent['count']
                        i += 1
    cnx.close()
    # .at[] stores scalars as objects/floats; restore integer dtypes.
    cols = ['gans', 'year', 'count']
    df_dataset[cols] = df_dataset[cols].astype(int)
    print(df_dataset)
    dataname = f"{source_type}_{data_type}"
    filename = f"{dataname}.csv"
    df_dataset.to_csv(filename, index=False)
    try:
        engine = create_engine(
            f"postgresql://{DB_USER_IMI}:{DB_PASS_IMI}@{DB_HOST_IMI}:5432/{DB_NAME_IMI}"
        )
        df_dataset.to_sql(dataname, con=engine, schema=DB_SCHEMA_IMI,
                          if_exists='replace', index=False)
    except Exception as err:
        logger.error(err)
        raise
    del df_dataset
def test_preserving_cleanups(terms):
    """Default cleanup keeps the expected remainder for each case."""
    for name, (raw, want) in preserving_cleanup_tests.items():
        got = basename(raw, terms)
        assert got == want, "preserving cleanup of %s failed" % name
def test_basic_cleanups(terms):
    """Every basic variation must reduce to the canonical name."""
    for name, raw in basic_cleanup_tests.items():
        cleaned = basename(raw, terms)
        assert cleaned == "Hello World", "cleanup of %s failed" % name
def test_with_unicode_umlauted_name(terms):
    """Prefix cleanup handles names containing umlauted characters."""
    for name, (raw, want) in unicode_umlaut_tests.items():
        got = basename(raw, terms, prefix=True)
        assert got == want, "preserving cleanup of %s failed" % name
def test_terms_with_accents(terms):
    """Suffix cleanup handles terms containing accented characters."""
    for name, (raw, want) in terms_with_accents_tests.items():
        got = basename(raw, terms, suffix=True)
        assert got == want, "preserving cleanup of %s failed" % name