def workDataFrame(currentData):
    # create empty data frame for working
    converted = pd.DataFrame(columns=range(0, len(currentData.columns) + 300))
    df = pd.DataFrame(columns=range(0, 300))  # vectors
    df = df.add_prefix("TE_")  # prefix for vector columns
    columns = currentData.columns  # naming
    columns = columns.append(df.columns)  # combine
    converted.columns = columns  # set names
    for index, firm in currentData.iterrows():  # for every single company
        print("Working on %s" % index)
        logger.info("(%s) Working on: %s", index, firm["Company_Name"])
        Trade_Description = firm["Trade_English"]  # extract trade description (expected to be a list of tokens)
        df = pd.DataFrame(columns=range(0, 300))  # empty frame for vectors
        df = df.add_prefix("TE_")  # add prefix
        for word in Trade_Description:  # for every word => build vector
            try:
                df = df.append(
                    pd.Series(model.wv[word], index=df.columns), ignore_index=True
                )  # append every word vector to df
            except KeyError:
                continue  # skip words not in the model vocabulary
        logger.info(
            "(%s) Working on: %s => %s entries", index, firm["Company_Name"], len(df)
        )
        converted.loc[index] = currentData.loc[index].append(
            df.mean()
        )  # calculate mean vector per company and add to converted
    return converted  # return converted to the pool function
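# Hypothetical dispatch sketch (not part of the original script): the comment
# "return converted to the pool function" suggests workDataFrame is applied per
# chunk by a multiprocessing Pool. Chunk count, function name, and variable names
# below are assumptions.
import multiprocessing as mp

import numpy as np
import pandas as pd


def parallelApply(data, func, chunks=8):
    # split the firm table into roughly equal row blocks
    parts = np.array_split(data, chunks)
    with mp.Pool(processes=chunks) as pool:
        results = pool.map(func, parts)  # each worker returns its converted block
    return pd.concat(results)  # reassemble the blocks in order


# possible usage: vectorized = parallelApply(data, workDataFrame, chunks=8)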
def workDataFrame(currentData):
    # create empty data frame for working
    # note: this variant reserves 600 vector columns; the per-word frame below must
    # use the same dimensionality so the mean lines up with converted's columns
    converted = pd.DataFrame(columns=range(0, len(currentData.columns) + 600))
    df = pd.DataFrame(columns=range(0, 600))  # vectors
    df = df.add_prefix("TE_")  # prefix for vector columns
    columns = currentData.columns  # naming
    columns = columns.append(df.columns)  # combine
    converted.columns = columns  # set names
    for index, firm in currentData.iterrows():
        print("Working on %s" % index)
        logger.info("(%s) Working on: %s", index, firm["Company_Name"])
        Trade_Description = firm["Trade_English"]
        df = pd.DataFrame(columns=range(0, 600))  # empty frame for vectors (was 300; must match converted)
        df = df.add_prefix("TE_")
        for word in Trade_Description:
            try:
                df = df.append(
                    pd.Series(model.wv[word], index=df.columns), ignore_index=True
                )
            except KeyError:
                continue  # skip words not in the model vocabulary
        logger.info(
            "(%s) Working on: %s => %s entries", index, firm["Company_Name"], len(df)
        )
        converted.loc[index] = currentData.loc[index].append(df.mean())
    return converted
# %% imports
import os
import sys

import pandas as pd

print(sys.path)
print(os.getcwd())

# %% Logging Setup
try:
    from Code.dolog import logger
    from Code.environment import filePath
except ImportError:
    try:
        from dolog import logger
        from environment import filePath
    except ImportError:
        sys.exit("Could not import necessary Code blocks.")

logger.info("WD is set to " + filePath)
dataPath = filePath + "/02_Data/"
logger.info("Writing to " + dataPath)

# %% Import Data
logger.info("Reading Parquet File")
data = pd.read_parquet(dataPath + "/data.vectorized.1000.parquet")

# %%
import sys

# %% Logging Setup
try:
    from Code.dolog import logger
    from Code.environment import filePath
except ImportError:
    try:
        from dolog import logger
        from environment import filePath
    except ImportError:
        sys.exit("Could not import necessary Code blocks.")

logger.info("WD is set to " + filePath)
dataPath = filePath + "/02_Data/"
logger.info("Writing to " + dataPath)

# %% Import Data
# Note: not necessary here. Uses sentences.csv from the word2vec model.

# %% Train FastText
from gensim.models.fasttext import FastText as FT_gensim
from gensim.models.word2vec import LineSentence

sentences = LineSentence(dataPath + "sentences.csv")
model_gensim = FT_gensim(size=300)
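# Sketch of the remaining FastText training steps (assumed continuation, not taken
# from the original file), using the gensim 3.x API implied by FT_gensim(size=300):
# build the vocabulary from the sentence stream, then train. Epoch count and the
# save path are assumptions.
model_gensim.build_vocab(sentences)
model_gensim.train(
    sentences,
    total_examples=model_gensim.corpus_count,
    epochs=5,  # epoch count is an assumption
)
model_gensim.save(dataPath + "fasttext.model")  # output path is an assumption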
import sys

try:
    from Code.environment import filePath
    from Code.dolog import logger
    from Code.Translate.jsTranslate_H1 import translate
    from Code.Translate.convert import convertBrokenText
except ImportError:
    try:
        from environment import filePath
        from dolog import logger
        from Translate.jsTranslate_H1 import translate
        from Translate.convert import convertBrokenText
    except ImportError:
        sys.exit("Could not import necessary Code blocks.")

logger.info("WD is set to " + filePath)
dataPath = filePath + "/02_Data/"
logger.info("Writing to " + dataPath)

translationEngine = "azure"  # select either azure or deepl

# %% Already started? If yes, cancel.
file = open(filePath + "/started.txt", "r")
started = file.read()
file.close()
if started == "started":
    logger.critical("Already executing")
    exit()

# %% Convert Stata file to parquet
# if False:
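# Hypothetical sketch of the guarded conversion step hinted at by the comment above
# (the original block body is not included here): read the raw Stata export with
# pandas and write it back out as parquet. File names and the pandas import are
# assumptions.
import pandas as pd

if False:  # flip to True once the raw Stata file is in place
    raw = pd.read_stata(dataPath + "data.dta")  # hypothetical input file name
    raw.to_parquet(dataPath + "data.raw.parquet")  # hypothetical output file name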