def main_parser(f):
    """Build a list of Instance objects from a labelled tweet CSV.

    @input  f: path or file-like object accepted by ``pd.read_csv``. The
               data must have a header row naming at least the columns
               ``tweet`` and ``class`` (as in labeled_data.csv).
    @output list with one Instance per CSV row, in file order.

    Each Instance gets four attributes:
      * ``label``       - the row's ``class`` value, cast to int
      * ``fulltweet``   - the raw tweet text as read from the file
      * ``clean_tweet`` - the result of ``preprocess`` on the raw text
      * ``wordlist``    - the result of ``tokenize`` on the cleaned text
                          (presumably stemmed tokens; see ``tokenize``)
    """
    # Load the whole file up front; the header row supplies column names.
    frame = pd.read_csv(f)
    tweet_column = frame.tweet
    class_column = frame['class'].astype(int)

    parsed = []
    for raw_text, class_id in zip(tweet_column, class_column):
        # Keep the untouched text and its label on the instance.
        instance = Instance()
        instance.label = class_id
        instance.fulltweet = raw_text

        # Cleaned text is kept separately from the raw text.
        instance.clean_tweet = preprocess(raw_text)

        # The word list is always derived from the cleaned text.
        instance.wordlist = tokenize(instance.clean_tweet)

        parsed.append(instance)

    return parsed