Example #1
0
def main_parser(f):
    """Build a list of Instance objects from a labeled tweet CSV.

    The file is loaded with ``pd.read_csv``; it must have a header row
    that provides a ``tweet`` column and a ``class`` column (the layout
    of labeled_data.csv). Class values are cast to int.

    Every instance carries four attributes:
      * ``label``       - the integer class of the row
      * ``fulltweet``   - the tweet exactly as read from the file
      * ``clean_tweet`` - the result of ``preprocess`` on the raw tweet
      * ``wordlist``    - the result of ``tokenize`` on the cleaned tweet

    @input file (anything ``pd.read_csv`` accepts)
    @output list of instance objects, one per row, in row order
    """
    frame = pd.read_csv(f)
    texts = frame.tweet
    classes = frame['class'].astype(int)

    parsed = []
    for text, tag in zip(texts, classes):
        # Start from the untouched row data.
        inst = Instance()
        inst.label, inst.fulltweet = tag, text

        # Strip the tweet down to its text, then split it into tokens.
        cleaned = preprocess(text)
        inst.clean_tweet = cleaned
        inst.wordlist = tokenize(cleaned)

        parsed.append(inst)

    return parsed