Example #1
def build_data():

    # create target variable
    body = pd.read_csv("train_bodies.csv")
    stances = pd.read_csv("train_stances.csv")
    data = pd.merge(body, stances, how='right', on='Body ID')
    targets = ['agree', 'disagree', 'discuss', 'unrelated']
    targets_dict = dict(zip(targets, range(len(targets))))
    data['target'] = list(map(lambda x: targets_dict[x], data['Stance']))

    data_y = data['target'].values

    # read features
    generators = [
                  CountFeatureGenerator(),
                  TfidfFeatureGenerator(),
                  SvdFeatureGenerator(),
                  Word2VecFeatureGenerator(),
                  SentimentFeatureGenerator()
                 ]

    features = [f for g in generators for f in g.read()]

    data_x = np.hstack(features)

    print('data_x.shape')
    print(data_x.shape)
    print('data_y.shape')
    print(data_y.shape)

    return data_x, data_y
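For context, a minimal sketch of how the returned arrays could feed a downstream model. The classifier choice and split parameters are illustrative assumptions (the gdbt_* filenames in later examples suggest gradient boosting), not part of the snippet above.

# Hypothetical usage sketch, not from the original source.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

data_x, data_y = build_data()
# illustrative hold-out split; the original training setup is not shown
x_tr, x_val, y_tr, y_val = train_test_split(
    data_x, data_y, test_size=0.2, random_state=42)
clf = GradientBoostingClassifier()  # assumed model, after the gdbt_* names
clf.fit(x_tr, y_tr)
print('validation accuracy:', clf.score(x_val, y_val))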
Example #2
def build_test_data():

    data = pd.read_csv('gdbt_testing_input.csv', encoding='cp1252')

    '''
    # create target variable
    # replace file names when test data is ready
    body = pd.read_csv("test_bodies_processed.csv")
    stances = pd.read_csv("test_stances_unlabeled.csv") # needs to contain pair id
    data = pd.merge(stances, body, how='left', on='Body ID')
    '''
    # read features
    generators = [
                  CountFeatureGenerator(),
                  TfidfFeatureGenerator(),
                  SvdFeatureGenerator(),
                  Word2VecFeatureGenerator(),
                  SentimentFeatureGenerator()
                 ]

    features = [f for g in generators for f in g.read("test")]
    print(len(features))
    #return 1

    data_x = np.hstack(features)
    print(data_x[0, :])
    print('test data_x.shape')
    print(data_x.shape)
    #print('test body_ids.shape')
    #print(data['Body ID'].values.shape)
    # pair id
    return data_x  #, data['Body ID'].values
Example #3
def build_test_data():
    
    # create target variable
    # replace file names when test data is ready
    body = pd.read_csv("test_bodies.csv")
    stances = pd.read_csv("test_stances_unlabeled.csv") # needs to contain pair id
    data = pd.merge(stances, body, how='left', on='Body ID')
    
    # read features
    generators = [
                  CountFeatureGenerator(),
                  TfidfFeatureGenerator(),
                  SvdFeatureGenerator(),
                  Word2VecFeatureGenerator(),
                  SentimentFeatureGenerator()
                 ]

    features = [f for g in generators for f in g.read("test")]
    print(len(features))
    #return 1

    data_x = np.hstack(features)
    print(data_x[0, :])
    print('test data_x.shape')
    print(data_x.shape)
    print('test body_ids.shape')
    print(data['Body ID'].values.shape)
    # pair id
    return data_x, data['Body ID'].values
Example #4
def build_test_data():

    # create target variable
    # replace file names when test data is ready
    body = pd.read_csv("train_bodies_processed.csv")
    # needs to contain pair id
    stances = pd.read_csv("../ensemble_learning/subtrain{}/test.csv".format(
        sys.argv[1]))
    data = pd.merge(stances, body, how='left', on='Body ID')

    # read features
    generators = [
        CountFeatureGenerator(),
        # TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator()
    ]

    features = [f for g in generators for f in g.read("test")]
    print(len(features))
    # return 1

    data_x = np.hstack(features)
    print(data_x[0, :])
    print('test data_x.shape')
    print(data_x.shape)
    print('test body_ids.shape')
    print(data['Body ID'].values.shape)
    # pair id
    return data_x, data['Body ID'].values
Example #5

def build_data():

    data = pd.read_csv('./data/merged_data_train.csv', encoding='utf-8')
    used_column = [
        'claimHeadline', 'articleHeadline', 'claimTruthiness', 'articleStance',
        'articleId'
    ]

    data = data[used_column].dropna()
    data['Headline'] = data['claimHeadline'].apply(lambda x: x[8:])
    data['articleBody'] = data['articleHeadline']
    data['Body ID'] = data['articleId']
    #targets = ['observing', 'for', 'against', 'ignoring']
    targets = ['unknown', 'false', 'true']
    targets_dict = dict(zip(targets, range(len(targets))))
    data['target'] = list(
        map(lambda x: targets_dict[x], data['claimTruthiness']))
    #data['target'] = data['target'].astype(int)

    train = data.sample(frac=0.6, random_state=2018)
    test = data.loc[~data.index.isin(train.index)]

    data_y = train['target'].values

    # generate features
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator()
        #AlignmentFeatureGenerator()
    ]
    features = [f for g in generators for f in g.read('train')]

    # print data shape
    data_x = np.hstack(features)
    print(data_x[0, :])
    print('data_x.shape')
    print(data_x.shape)
    print('data_y.shape')
    print(data_y.shape)
    print('body_ids.shape')
    print(data['Body ID'].values.shape)

    return data_x, data_y, data['Body ID'].values, test[[
        'target', 'Headline', 'Body ID'
    ]]
Example #6
def build_data():
    
    # create target variable
    
    train = pd.read_csv('gdbt_training_input.csv', encoding='cp1252')
    targets = ['fake', 'real']
    targets_dict = dict(zip(targets, range(len(targets))))
    #train['target'] = list(map(lambda x: targets_dict[x], train['Stance']))
    #train['target'] = list(train['target'])
    '''
    
    body = pd.read_csv("train_bodies_processed.csv")
    stances = pd.read_csv("train_stances_processed.csv")
    data = pd.merge(stances, body, how='left', on='Body ID')
    targets = ['agree', 'disagree', 'discuss', 'unrelated']
    targets_dict = dict(zip(targets, range(len(targets))))
    data['target'] = list(map(lambda x: targets_dict[x], data['Stance']))
    data.head()
    '''
    data_y = train['Stance'].values

    # read features
    generators = [
                  CountFeatureGenerator(),
                  TfidfFeatureGenerator(),
                  SvdFeatureGenerator(),
                  Word2VecFeatureGenerator(),
                  SentimentFeatureGenerator()
                  #AlignmentFeatureGenerator()
                 ]

    features = [f for g in generators for f in g.read('train')]


    data_x = np.hstack(features)
    print(data_x[0, :])
    print('data_x.shape')
    print(data_x.shape)
    print('data_y.shape')
    print(data_y.shape)
    #print('body_ids.shape')
    #print(data['Body ID'].values.shape)
    
    #with open('data_new.pkl', 'wb') as outfile:
    #    pickle.dump(data_x, outfile, -1)
    #    print('data saved in data_new.pkl')

    return data_x, data_y  #, data['Body ID'].values
Example #7

def build_data():

    # create target variable
    body = pd.read_csv("train_bodies_processed.csv")
    stances = pd.read_csv("train_stances_processed.csv")
    data = pd.merge(stances, body, how='left', on='Body ID')
    #    targets = ['agree', 'disagree', 'discuss', 'unrelated']
    #    targets_dict = dict(zip(targets, range(len(targets))))
    targets_dict = {'agree': 1, 'disagree': 1, 'discuss': 1, 'unrelated': 0}
    data['target'] = list(map(lambda x: targets_dict[x], data['Stance']))

    data_y = data['target'].values

    # read features
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        NERFeatureGenerator()
        #SentimentFeatureGenerator()
        #AlignmentFeatureGenerator()
    ]

    features = [f for g in generators for f in g.read('train')]
    # column offsets marking where each generator's block ends in the
    # stacked matrix (feature_boundary is assumed local here; it may be
    # a module-level dict in the original source)
    feature_boundary = {}
    feature_boundary['TfidfFeature'] = 42
    feature_boundary['SvdFeature'] = 143
    feature_boundary['Word2VecFeature'] = 744
    feature_boundary['EntFeature'] = 748

    print([i.shape for i in features])
    data_x = np.hstack(features)
    # exit()
    print(data_x[0, :])
    print('data_x.shape')
    print(data_x.shape)
    print('data_y.shape')
    print(data_y.shape)
    print('body_ids.shape')
    print(data['Body ID'].values.shape)

    #with open('data_new.pkl', 'wb') as outfile:
    #    pickle.dump(data_x, outfile, -1)
    #    print('data saved in data_new.pkl')

    return data_x, data_y, data['Body ID'].values
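A hedged sketch of what those boundary offsets enable: cutting the stacked matrix back into per-generator column blocks. The helper name and the assumption that each recorded value is an end offset (with the Count features occupying the columns before the first boundary) are mine, not stated in the snippet.

def split_feature_groups(data_x, feature_boundary):
    # Hypothetical helper: slice the hstack-ed feature matrix back into
    # blocks using the recorded column end offsets.
    bounds = sorted(feature_boundary.items(), key=lambda kv: kv[1])
    blocks, start = {}, 0
    for name, end in bounds:
        blocks[name] = data_x[:, start:end]
        start = end
    return blocks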
Example #8

def build_data():
    
    # create target variable
    body = pd.read_csv("train_bodies.csv")
    stances = pd.read_csv("train_stances.csv")
    data = pd.merge(stances, body, how='left', on='Body ID')
    targets = ['agree', 'disagree', 'discuss', 'unrelated']
    targets_dict = dict(zip(targets, range(len(targets))))
    data['target'] = list(map(lambda x: targets_dict[x], data['Stance']))
    
    data_y = data['target'].values

    # read features
    generators = [
                  CountFeatureGenerator(),
                  TfidfFeatureGenerator(),
                  SvdFeatureGenerator(),
                  Word2VecFeatureGenerator(),
                  SentimentFeatureGenerator()
                  #AlignmentFeatureGenerator()
                 ]

    features = [f for g in generators for f in g.read('train')]
    #for f in features:
    #    features[1] = features[1].toarray()
    #    print(type(f), f.shape)

    #data_x = np.hstack(features)
    #data_x = np.column_stack(features)
    data_x = np.hstack(features)
    print(data_x[0, :])
    print('data_x.shape')
    print(data_x.shape)
    print('data_y.shape')
    print(data_y.shape)
    print('body_ids.shape')
    print(data['Body ID'].values.shape)
    
    #with open('data_new.pkl', 'wb') as outfile:
    #    pickle.dump(data_x, outfile, -1)
    #    print('data saved in data_new.pkl')

    return data_x, data_y, data['Body ID'].values
Example #9

def build_test_data():

    # create target variable
    # replace file names when test data is ready
    data = pd.read_csv('./data/merged_data_train.csv', encoding='utf-8')
    used_column = [
        'claimHeadline', 'articleHeadline', 'claimTruthiness', 'articleStance',
        'articleId'
    ]

    data = data[used_column].dropna()
    data['Headline'] = data['claimHeadline'].apply(lambda x: x[8:])
    data['articleBody'] = data['articleHeadline']
    data['Body ID'] = data['articleId']
    #targets = ['observing', 'for', 'against', 'ignoring']
    targets = ['unknown', 'false', 'true']
    targets_dict = dict(zip(targets, range(len(targets))))
    data['target'] = list(
        map(lambda x: targets_dict[x], data['claimTruthiness']))

    train = data.sample(frac=0.6, random_state=2018)
    test = data.loc[~data.index.isin(train.index)]

    # generate features
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator()
    ]

    features = [f for g in generators for f in g.read("test")]
    print(len(features))

    data_x = np.hstack(features)
    print(data_x[0, :])
    print('test data_x.shape')
    print(data_x.shape)
    print('test body_ids.shape')
    print(test['Body ID'].values.shape)
    # pair id
    return data_x, test['Body ID'].values, test['target']
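Examples #5 and #9 sample the same merged file with the same random_state, so their train/test splits line up; under that assumption, a minimal end-to-end sketch (the classifier is an illustrative choice, not the project's stated model):

# Hypothetical pipeline combining Examples #5 and #9.
from sklearn.ensemble import GradientBoostingClassifier

train_x, train_y, _, held_out = build_data()       # Example #5
test_x, test_body_ids, test_y = build_test_data()  # Example #9

clf = GradientBoostingClassifier()  # assumed model choice
clf.fit(train_x, train_y)
pred = clf.predict(test_x)
print('held-out accuracy:', (pred == test_y.values).mean())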
Example #10
def build_data():

    # create target variable
    body = pd.read_csv("train_bodies_processed.csv")
    stances = pd.read_csv("../ensemble_learning/subtrain{}/train.csv".format(
        sys.argv[1]))
    data = pd.merge(stances, body, how='left', on='Body ID')
    targets = ['agree', 'disagree', 'discuss', 'unrelated']
    targets_dict = dict(zip(targets, range(len(targets))))
    data['target'] = list(map(lambda x: targets_dict[x], data['Stance']))

    data_y = data['target'].values

    # read features
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator()
        # AlignmentFeatureGenerator()
    ]

    features = [f for g in generators for f in g.read('train')]

    data_x = np.hstack(features)
    print(data_x[0, :])
    print('data_x.shape')
    print(data_x.shape)
    print('data_y.shape')
    print(data_y.shape)
    print('body_ids.shape')
    print(data['Body ID'].values.shape)

    # with open('data_new.pkl', 'wb') as outfile:
    #    pickle.dump(data_x, outfile, -1)
    #    print('data saved in data_new.pkl')

    return data_x, data_y, data['Body ID'].values
Example #11
def process():

    read = False
    if not read:

        body_train = pd.read_csv("train_bodies_processed.csv",
                                 encoding='utf-8')
        stances_train = pd.read_csv("train_stances_processed.csv",
                                    encoding='utf-8')
        # training set
        train = pd.merge(stances_train, body_train, how='left', on='Body ID')
        targets = ['agree', 'disagree', 'discuss', 'unrelated']
        targets_dict = dict(zip(targets, range(len(targets))))
        train['target'] = list(map(lambda x: targets_dict[x], train['Stance']))
        print('train.shape:')
        print(train.shape)
        n_train = train.shape[0]

        data = train
        # read test set, no 'Stance' column in test set -> target = NULL
        # concatenate training and test set
        test_flag = True
        if test_flag:
            body_test = pd.read_csv("test_bodies_processed.csv",
                                    encoding='utf-8')
            headline_test = pd.read_csv("test_stances_unlabeled.csv",
                                        encoding='utf-8')
            test = pd.merge(headline_test, body_test, how="left", on="Body ID")

            data = pd.concat((train, test))  # target = NaN for test set
            print(data)
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        #data = data.iloc[:100, :]

        #return 1

        print "generate unigram"
        data["Headline_unigram"] = data["Headline"].map(
            lambda x: preprocess_data(x))
        data["articleBody_unigram"] = data["articleBody"].map(
            lambda x: preprocess_data(x))

        print "generate bigram"
        join_str = "_"
        data["Headline_bigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))
        data["articleBody_bigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))

        print "generate trigram"
        join_str = "_"
        data["Headline_trigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))
        data["articleBody_trigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            pickle.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')

    else:
        with open('data.pkl', 'rb') as infile:
            data = pickle.load(infile)
            print('data loaded')
            print('data.shape:')
            print(data.shape)
    #return 1

    # define feature generators
    countFG = CountFeatureGenerator()
    tfidfFG = TfidfFeatureGenerator()
    svdFG = SvdFeatureGenerator()
    word2vecFG = Word2VecFeatureGenerator()
    sentiFG = SentimentFeatureGenerator()
    #walignFG   = AlignmentFeatureGenerator()
    generators = [countFG, tfidfFG, svdFG, word2vecFG, sentiFG]
    #generators = [svdFG, word2vecFG, sentiFG]
    #generators = [tfidfFG]
    #generators = [countFG]
    #generators = [walignFG]

    for g in generators:
        g.process(data)

    for g in generators:
        g.read('train')

    #for g in generators:
    #    g.read('test')

    print('done')
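All of these snippets lean on a shared generator contract that is never shown: process(data) computes features from the merged DataFrame and persists them, and read(header) loads them back as a list of 2-D arrays ready for np.hstack. A minimal sketch of that assumed interface follows; the class name, filename pattern, and pickle storage are guesses, not the project's actual code.

import pickle

class FeatureGenerator(object):
    # Assumed base contract inferred from how the snippets call the
    # generators; CountFeatureGenerator etc. are defined elsewhere.
    name = 'base'

    def process(self, data):
        # compute features from the merged DataFrame and persist them
        raise NotImplementedError

    def read(self, header='train'):
        # load persisted features for the given split; the filename
        # pattern here is hypothetical
        with open('%s.%s.pkl' % (self.name, header), 'rb') as infile:
            return [pickle.load(infile)]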
Example #12
def process():

    full_data = pd.read_csv('./data/merged_data_train.csv', encoding='utf-8')
    used_column = [
        'claimHeadline', 'articleHeadline', 'claimTruthiness', 'articleStance'
    ]
    full_data = full_data[used_column]
    full_data = full_data.dropna()
    train, test = train_test_split(full_data,
                                   test_size=0.33,
                                   random_state=1234)

    read = False
    if not read:

        targets = ['observing', 'for', 'against', 'ignoring']
        targets_dict = dict(zip(targets, range(len(targets))))
        train['target'] = list(
            map(lambda x: targets_dict[x], train['articleStance']))
        print('train.shape:')
        print(train.shape)
        n_train = train.shape[0]

        data = train
        # read test set, no 'Stance' column in test set -> target = NULL
        # concatenate training and test set
        test_flag = True
        if test_flag:

            data = train
            print(data)
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        #data = data.iloc[:100, :]

        #return 1

        print "generate unigram"
        data["claimHeadline_unigram"] = data["claimHeadline"].map(
            lambda x: preprocess_data(x))
        data["articleHeadline_unigram"] = data["articleHeadline"].map(
            lambda x: preprocess_data(x))

        print "generate bigram"
        join_str = "_"
        data["claimHeadline_bigram"] = data["claimHeadline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))
        data["articleHeadline_bigram"] = data["articleHeadline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))

        print "generate trigram"
        join_str = "_"
        data["claimHeadline_trigram"] = data["claimHeadline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))
        data["articleHeadline_trigram"] = data["articleHeadline_bigram"].map(
            lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            pickle.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')

    else:
        with open('data.pkl', 'rb') as infile:
            data = pickle.load(infile)
            print('data loaded')
            print('data.shape:')
            print(data.shape)
    #return 1

    # define feature generators
    countFG = CountFeatureGenerator()
    tfidfFG = TfidfFeatureGenerator()
    svdFG = SvdFeatureGenerator()
    word2vecFG = Word2VecFeatureGenerator()
    sentiFG = SentimentFeatureGenerator()
    #walignFG   = AlignmentFeatureGenerator()
    generators = [countFG, tfidfFG, svdFG, word2vecFG, sentiFG]
    #generators = [svdFG, word2vecFG, sentiFG]
    #generators = [tfidfFG]
    #generators = [countFG]
    #generators = [walignFG]

    for g in generators:
        g.process(data)

    for g in generators:
        g.read('train')

    #for g in generators:
    #    g.read('test')

    print('done')
Example #13

def process():

    read = False
    if not read:
        '''
        body_train = pd.read_csv("train_bodies_processed.csv", encoding='utf-8')
        stances_train = pd.read_csv("train_stances_processed.csv", encoding='utf-8')
        # training set
        train = pd.merge(stances_train, body_train, how='left', on='Body ID')
        
        train.head()
        targets = ['agree', 'disagree', 'discuss', 'unrelated']
        targets_dict = dict(zip(targets, range(len(targets))))
        train['target'] = map(lambda x: targets_dict[x], train['Stance'])
        print ('train.shape:')
        print (train.shape)
        n_train = train.shape[0]
        '''
        #sample starts

        sample_head = "Italy culls birds after five H5N8 avian flu outbreaks in October"
        sample_body = "ROME (Reuters) - Italy has had five outbreaks of highly pathogenic H5N8 avian flu in farms the central and northern parts of the country since the start of the month and about 880,000 chickens, ducks and turkeys will be culled, officials said on Wednesday.\
            The biggest outbreak of the H5N8 virus, which led to the death or killing of millions of birds in an outbreak in western Europe last winter, was at a large egg producing farm in the province of Ferrara.\
            The latest outbreak was confirmed on Oct. 6 and about 853,000 hens are due to be culled by Oct. 17, the IZSV zoological institute said.\
            Another involved 14,000 turkeys in the province of Brescia, which are due to be culled by Oct. 13.\
            A third involved 12,400 broiler chickens at a smaller farm in the province of Vicenza and two others were among a small number of hens, ducks, broilers and turkeys on family farms.\
            In those three cases, all the birds have been culled."

        sample_head_pd = pd.DataFrame([sample_head])
        sample_body_pd = pd.DataFrame([sample_body])
        sample_data_pd = pd.concat((sample_head_pd, sample_body_pd), axis=1)
        sample_data_pd.columns = ['Headline', 'articleBody']
        sample_data_pd['URLs'] = np.nan
        sample_data_pd['Stance'] = np.nan

        #sample ends

        dataset = pd.read_csv('data.csv')

        dataset.isnull().sum()  # leftover interactive check; result is not used

        dataset = dataset[pd.notnull(dataset['Body'])]

        dataset.columns = ['URLs', 'Headline', 'articleBody', 'Stance']

        X_data = dataset.iloc[:, 1:3]
        Y_data = dataset.iloc[:, 3]

        # sklearn.cross_validation was removed in newer scikit-learn
        from sklearn.model_selection import train_test_split

        X_train, X_test, Y_train, Y_test = train_test_split(X_data,
                                                            Y_data,
                                                            test_size=0.25,
                                                            random_state=0)

        train = pd.concat([X_train, Y_train], axis=1)

        train.to_csv('gdbt_training_input.csv', index=False)

        X_test.to_csv('gdbt_testing_input.csv', index=False)
        Y_test = pd.DataFrame(Y_test)
        Y_test.to_csv('gdbt_testing_output.csv', index=False)

        targets = ['Fake', 'Real']
        targets_dict = dict(zip(targets, range(len(targets))))
        train['target'] = list(map(lambda x: targets_dict[x], train['Stance']))

        data = train

        # read test set, no 'Stance' column in test set -> target = NULL
        # concatenate training and test set
        test_flag = True

        if test_flag:
            '''
            body_test = pd.read_csv("test_bodies_processed.csv", encoding='utf-8')
            headline_test = pd.read_csv("test_stances_unlabeled.csv", encoding='utf-8')
            test = pd.merge(headline_test, body_test, how="left", on="Body ID")
            '''
            data = pd.concat((train, X_test))  # target = NaN for test set
            #print (data)
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        #data = data.iloc[:100, :]

        #return 1

        print("generate unigram")
        data["Headline_unigram"] = data["Headline"].map(
            lambda x: preprocess_data(x))
        print(data.head())
        data["articleBody_unigram"] = data["articleBody"].map(
            lambda x: preprocess_data(x))

        print("generate bigram")
        join_str = "_"
        data["Headline_bigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))
        data["articleBody_bigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))

        print("generate trigram")
        join_str = "_"
        data["Headline_trigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))
        data["articleBody_trigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            pickle.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')

    else:
        with open('data.pkl', 'rb') as infile:
            data = pickle.load(infile)
            print('data loaded')
            print('data.shape:')
            print(data.shape)
    #return 1

    # define feature generators
    countFG = CountFeatureGenerator()
    tfidfFG = TfidfFeatureGenerator()
    svdFG = SvdFeatureGenerator()
    word2vecFG = Word2VecFeatureGenerator()
    sentiFG = SentimentFeatureGenerator()
    #walignFG   = AlignmentFeatureGenerator()
    generators = [countFG, tfidfFG, svdFG, word2vecFG, sentiFG]
    #generators = [svdFG, word2vecFG, sentiFG]
    #generators = [tfidfFG]
    #generators = [countFG]
    #generators = [walignFG]

    #countFG.process(data)
    #countFG.read()

    #word2vecFG.process(data)

    #sentiFG.process(data)

    for g in generators:
        g.process(data)

    for g in generators:
        g.read('train')

    for g in generators:
        g.read('test')

    print('done')