import os
import re

import pandas as pd

import matcher    # project module: dictionary-match and TextBlob scoring
import validator  # project module: train/test splitting


def get_scored_dataframe(path):
    """Return a DataFrame of MD&A filenames with their dictionary-match and TextBlob scores."""
    dataset = []
    for filename in os.listdir(path):
        with open(os.path.join(path, filename), "r") as f:
            text = f.read()
        dataset.append([filename, re.sub('[^a-zA-Z]+', ' ', text)])
    dico = matcher.get_dico(dataset)   # matching scores of the MD&As versus the Finance Dictionary
    df = pd.DataFrame(dataset)
    blob = matcher.get_blob(df)        # TextBlob scores, computed while column 1 still holds the text
    df[1] = pd.Series(dico)
    df[2] = pd.Series(blob)
    df.columns = ['Filename', 'MatchDico', 'TextBlob']
    return df
def get_dataset(path):
    """Return a list of [cleaned MD&A text, base filename, label] rows for files ending in _pos or _neg."""
    dataset = []
    for filename in os.listdir(path):
        if filename.endswith("pos") or filename.endswith("neg"):
            label = filename[-3:]                  # "pos" or "neg"
            with open(os.path.join(path, filename), "r") as f:
                text = f.read()
            dataset.append([re.sub('[^a-zA-Z]+', ' ', text),
                            re.sub(r"_%s$" % label, '', filename),
                            label])
    return dataset
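
# Illustration (hypothetical name): a file called "example_filing_pos" would yield the row
# [cleaned MD&A text, "example_filing", "pos"].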

### Main script

# FEATURE 1 - Match with the Loughran-McDonald Dictionary
dataset = get_dataset("../mdatest/")
dico = matcher.get_dico(dataset)   # matching scores of the MD&As versus the Finance Dictionary
df = pd.DataFrame(dataset)
df[3] = pd.Series(dico)
df.columns = ['MD&A_Text', 'Filename', 'Actual', 'MatchDico']
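
# Illustrative sketch only (assumption): matcher.get_dico is a project module whose code is not
# shown here; a dictionary-match score of this kind could be computed as the share of a text's
# words that appear in the finance word list. The helper below is never called.
def dico_score_example(text, finance_words):
    """Return the fraction of words in `text` that appear in the set `finance_words`."""
    words = text.lower().split()
    if not words:
        return 0.0
    return sum(word in finance_words for word in words) / len(words)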

# FEATURE 2 and 3 - Merge with the Compustat financial data to get the indices 'delta_sale' and 'delta_at'
compustat = pd.read_csv('compustat_filenames.csv', sep=',')
ds = pd.merge(df, compustat, on='Filename')   # attaches delta_sale and delta_at to each MD&A row
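# Assumption (file not shown here): compustat_filenames.csv holds one row per filing with at
# least the columns 'Filename', 'delta_sale' and 'delta_at'.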


# We split the merged feature matrix ds into a training and a testing set (50/50)
train, test = validator.split(ds, 0.5)
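
# Illustrative sketch only (assumption): one way a 50/50 split could be implemented; the script
# keeps using validator.split above, and this helper is never called.
def split_half_example(frame, train_fraction=0.5):
    """Shuffle the rows of a DataFrame and split them into a training and a testing frame."""
    shuffled = frame.sample(frac=1, random_state=0).reset_index(drop=True)
    cut = int(len(shuffled) * train_fraction)
    return shuffled.iloc[:cut], shuffled.iloc[cut:]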

# We fit a Random Forest model (n_estimators default=10, min_samples_leaf default=1)
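# Minimal sketch of that fit (assumptions: scikit-learn's RandomForestClassifier, and that the
# feature columns are 'MatchDico', 'delta_sale' and 'delta_at' with 'Actual' as the label):
# from sklearn.ensemble import RandomForestClassifier
# features = ['MatchDico', 'delta_sale', 'delta_at']
# forest = RandomForestClassifier(n_estimators=10, min_samples_leaf=1)
# forest.fit(train[features], train['Actual'])
# predicted = forest.predict(test[features])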