Ejemplo n.º 1
0
def workDataFrame(currentData):
    """Vectorize each firm's trade description and append the mean vector.

    For every row of *currentData*, look up each word of the
    ``Trade_English`` text in the word2vec ``model``, average the word
    vectors, and append the 300 resulting ``TE_*`` columns to the firm's
    original columns.

    Parameters
    ----------
    currentData : pd.DataFrame
        Must contain at least ``Company_Name`` and ``Trade_English``.

    Returns
    -------
    pd.DataFrame
        One row per input firm: the original columns followed by
        ``TE_0``..``TE_299`` holding the mean word vector (all-NaN when
        no word was found in the model vocabulary).
    """
    VEC_DIM = 300  # dimensionality of the word2vec model vectors
    # Names of the embedding columns: TE_0 .. TE_299
    vec_cols = pd.Index(["TE_%d" % i for i in range(VEC_DIM)])
    converted = pd.DataFrame(columns=currentData.columns.append(vec_cols))

    for index, firm in currentData.iterrows():  # for every single company
        # fix: the format argument was passed to print() as a second
        # positional argument instead of being interpolated
        print("Working on %s" % index)
        logger.info("(%s) Working on: %s", index, firm["Company_Name"])

        # Collect one vector per in-vocabulary word.
        vectors = []
        for word in firm["Trade_English"]:
            try:
                vectors.append(model.wv[word])
            except KeyError:  # out-of-vocabulary word — skip it
                continue

        logger.info("(%s) Working on: %s => %s entries", index,
                    firm["Company_Name"], len(vectors))
        # DataFrame.append / Series.append are deprecated (removed in
        # pandas 2.x); build the frame once and concatenate the two
        # Series instead of growing a frame row by row.
        word_df = pd.DataFrame(vectors, columns=vec_cols)
        converted.loc[index] = pd.concat([currentData.loc[index],
                                          word_df.mean()])

    return converted  # return converted to the pool function
Ejemplo n.º 2
0
def workDataFrame(currentData):
    """Vectorize each firm's trade description and append the mean vector.

    For every row of *currentData*, look up each word of the
    ``Trade_English`` text in the word2vec ``model``, average the word
    vectors, and append the 600 resulting ``TE_*`` columns to the firm's
    original columns.

    Parameters
    ----------
    currentData : pd.DataFrame
        Must contain at least ``Company_Name`` and ``Trade_English``.

    Returns
    -------
    pd.DataFrame
        One row per input firm: the original columns followed by
        ``TE_0``..``TE_599`` holding the mean word vector (all-NaN when
        no word was found in the model vocabulary).
    """
    # BUG FIX: the output frame was declared with 600 TE_* columns while
    # the per-firm vector frame used only 300, leaving TE_300..TE_599
    # permanently NaN. Use a single constant everywhere.
    VEC_DIM = 600  # dimensionality of the word2vec model vectors
    vec_cols = pd.Index(["TE_%d" % i for i in range(VEC_DIM)])
    converted = pd.DataFrame(columns=currentData.columns.append(vec_cols))

    for index, firm in currentData.iterrows():
        # fix: the format argument was passed to print() as a second
        # positional argument instead of being interpolated
        print("Working on %s" % index)
        logger.info("(%s) Working on: %s", index, firm["Company_Name"])

        # Collect one vector per in-vocabulary word.
        vectors = []
        for word in firm["Trade_English"]:
            try:
                vectors.append(model.wv[word])
            except KeyError:  # out-of-vocabulary word — skip it
                continue

        logger.info(
            "(%s) Working on: %s => %s entries", index, firm["Company_Name"],
            len(vectors)
        )
        # DataFrame.append / Series.append are deprecated (removed in
        # pandas 2.x); build the frame once and concatenate instead.
        word_df = pd.DataFrame(vectors, columns=vec_cols)
        converted.loc[index] = pd.concat([currentData.loc[index],
                                          word_df.mean()])

    return converted
Ejemplo n.º 3
0
# %% imports

import os
import sys

import pandas as pd

print(sys.path)
print(os.getcwd())
# %% Logging Setup

# Import the project helpers; fall back to the flat layout when the
# script is executed from inside the Code/ directory itself.
try:
    from Code.dolog import logger
    from Code.environment import filePath
except ImportError:
    try:
        from dolog import logger
        from environment import filePath
    except ImportError:
        sys.exit("Could not import necessary Code blocks.")

logger.info("WD is set to   " + filePath)
dataPath = filePath + "/02_Data/"
logger.info("Writing to	 " + dataPath)

# %% Import Data

logger.info("Reading Parquet File")
# dataPath already ends with a slash — no extra "/" needed.
data = pd.read_parquet(dataPath + "data.vectorized.1000.parquet")

# %%
Ejemplo n.º 4
0
import sys

# %% Logging Setup

# Import the project helpers; fall back to the flat layout when the
# script is executed from inside the Code/ directory itself.
try:
    from Code.dolog import logger
    from Code.environment import filePath
except ImportError:
    try:
        from dolog import logger
        from environment import filePath
    except ImportError:
        sys.exit("Could not import necessary Code blocks.")

logger.info("WD is set to   " + filePath)
dataPath = filePath + "/02_Data/"
logger.info("Writing to	 " + dataPath)

# %% Import Data
## Note: Not necessary. Uses sentences.csv from the word2vec model

# %% Train Fasttext

from gensim.models.fasttext import FastText as FT_gensim
from gensim.models.word2vec import LineSentence

# Stream the corpus line by line instead of loading it into memory.
sentences = LineSentence(dataPath + "sentences.csv")

# NOTE(review): gensim >= 4.0 renamed ``size`` to ``vector_size`` —
# confirm the installed gensim version before upgrading.
model_gensim = FT_gensim(size=300)
Ejemplo n.º 5
0
# Import the project helpers; fall back to the flat layout when the
# script is executed from inside the Code/ directory itself.
try:
    from Code.environment import filePath
    from Code.dolog import logger
    from Code.Translate.jsTranslate_H1 import translate
    from Code.Translate.convert import convertBrokenText
except ImportError:
    try:
        from environment import filePath
        from dolog import logger
        from Translate.jsTranslate_H1 import translate
        from Translate.convert import convertBrokenText
    except ImportError:
        # NOTE(review): ``sys`` is not imported in this chunk — confirm
        # it is imported earlier in the file.
        sys.exit("Could not import necessary Code blocks.")

logger.info("WD is set to   " + filePath)
dataPath = filePath + "/02_Data/"
logger.info("Writing to	 " + dataPath)
translationEngine = "azure"  # select either azure or deepl

# %% Already started? If yes, cancel.
# Lock file guards against a second concurrent run of this script;
# use a context manager so the handle is closed even on error.
with open(filePath + "/started.txt", "r") as file:
    started = file.read()

if started == "started":
    logger.critical("Already executing")
    exit()

# %% Convert Stata file to parquet
# if False: