def build_data():
    # create target variable
    body = pd.read_csv("train_bodies.csv")
    stances = pd.read_csv("train_stances.csv")
    data = pd.merge(body, stances, how='right', on='Body ID')
    targets = ['agree', 'disagree', 'discuss', 'unrelated']
    targets_dict = dict(zip(targets, range(len(targets))))
    # list(...) so the column is materialized under Python 3
    data['target'] = list(map(lambda x: targets_dict[x], data['Stance']))
    data_y = data['target'].values

    # read features
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator()
    ]
    features = [f for g in generators for f in g.read()]
    data_x = np.hstack(features)
    print('data_x.shape')
    print(data_x.shape)
    print('data_y.shape')
    print(data_y.shape)

    return data_x, data_y

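# The five generator classes above are assumed to share a common two-step
# interface: process(data) computes features from the merged dataframe and
# caches them to disk, while read(header) reloads them as a list of numpy
# arrays for the given split. A minimal sketch of that assumed contract
# (illustrative only; the real implementations live in their own modules):
class _FeatureGeneratorSketch(object):
    def process(self, data):
        # compute features from the merged headline/body dataframe and
        # persist them (e.g. via pickle) for later reads
        raise NotImplementedError

    def read(self, header='train'):
        # reload the persisted features for the given split ('train'/'test')
        # and return them as a list of 2-D numpy arrays
        raise NotImplementedError
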
def build_test_data():
    data = pd.read_csv('gdbt_testing_input.csv', encoding='cp1252')
    '''
    # create target variable
    # replace file names when test data is ready
    body = pd.read_csv("test_bodies_processed.csv")
    stances = pd.read_csv("test_stances_unlabeled.csv")  # needs to contain pair id
    data = pd.merge(stances, body, how='left', on='Body ID')
    '''
    # read features
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator()
    ]
    features = [f for g in generators for f in g.read("test")]
    print(len(features))
    data_x = np.hstack(features)
    print(data_x[0, :])
    print('test data_x.shape')
    print(data_x.shape)
    #print('test body_ids.shape')
    #print(data['Body ID'].values.shape)  # pair id
    return data_x  #, data['Body ID'].values

def build_test_data():
    # create target variable
    # replace file names when test data is ready
    body = pd.read_csv("test_bodies.csv")
    stances = pd.read_csv("test_stances_unlabeled.csv")  # needs to contain pair id
    data = pd.merge(stances, body, how='left', on='Body ID')

    # read features
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator()
    ]
    features = [f for g in generators for f in g.read("test")]
    print(len(features))
    data_x = np.hstack(features)
    print(data_x[0, :])
    print('test data_x.shape')
    print(data_x.shape)
    print('test body_ids.shape')
    print(data['Body ID'].values.shape)  # pair id
    return data_x, data['Body ID'].values

def build_test_data(): # create target variable # replace file names when test data is ready body = pd.read_csv("train_bodies_processed.csv") # needs to contain pair id stances = pd.read_csv("../ensemble_learning/subtrain{}/test.csv".format( sys.argv[1])) data = pd.merge(stances, body, how='left', on='Body ID') # read features generators = [ CountFeatureGenerator(), # TfidfFeatureGenerator(), SvdFeatureGenerator(), Word2VecFeatureGenerator(), SentimentFeatureGenerator() ] features = [f for g in generators for f in g.read("test")] print len(features) # return 1 data_x = np.hstack(features) print data_x[0, :] print 'test data_x.shape' print data_x.shape print 'test body_ids.shape' print data['Body ID'].values.shape # pair id return data_x, data['Body ID'].values
def build_data():
    data = pd.read_csv('./data/merged_data_train.csv', encoding='utf-8')
    used_column = [
        'claimHeadline', 'articleHeadline', 'claimTruthiness', 'articleStance',
        'articleId'
    ]
    data = data[used_column].dropna()
    # drop the first 8 characters of the claim headline (assumed boilerplate prefix)
    data['Headline'] = data['claimHeadline'].apply(lambda x: x[8:])
    data['articleBody'] = data['articleHeadline']
    data['Body ID'] = data['articleId']
    #targets = ['observing', 'for', 'against', 'ignoring']
    targets = ['unknown', 'false', 'true']
    targets_dict = dict(zip(targets, range(len(targets))))
    data['target'] = list(map(lambda x: targets_dict[x], data['claimTruthiness']))
    #data['target'] = data['target'].astype(int)
    train = data.sample(frac=0.6, random_state=2018)
    test = data.loc[~data.index.isin(train.index)]
    data_y = train['target'].values

    # generate features
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator()
        #AlignmentFeatureGenerator()
    ]
    features = [f for g in generators for f in g.read('train')]

    # print data shape
    data_x = np.hstack(features)
    print(data_x[0, :])
    print('data_x.shape')
    print(data_x.shape)
    print('data_y.shape')
    print(data_y.shape)
    print('body_ids.shape')
    print(data['Body ID'].values.shape)

    return data_x, data_y, data['Body ID'].values, test[[
        'target', 'Headline', 'Body ID'
    ]]

def build_data():
    # create target variable
    train = pd.read_csv('gdbt_training_input.csv', encoding='cp1252')
    targets = ['fake', 'real']
    targets_dict = dict(zip(targets, range(len(targets))))
    #train['target'] = list(map(lambda x: targets_dict[x], train['Stance']))
    '''
    body = pd.read_csv("train_bodies_processed.csv")
    stances = pd.read_csv("train_stances_processed.csv")
    data = pd.merge(stances, body, how='left', on='Body ID')
    targets = ['agree', 'disagree', 'discuss', 'unrelated']
    targets_dict = dict(zip(targets, range(len(targets))))
    data['target'] = list(map(lambda x: targets_dict[x], data['Stance']))
    data.head()
    '''
    data_y = train['Stance'].values

    # read features
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator()
        #AlignmentFeatureGenerator()
    ]
    features = [f for g in generators for f in g.read('train')]
    data_x = np.hstack(features)
    print(data_x[0, :])
    print('data_x.shape')
    print(data_x.shape)
    print('data_y.shape')
    print(data_y.shape)
    #print('body_ids.shape')
    #print(data['Body ID'].values.shape)
    #with open('data_new.pkl', 'wb') as outfile:
    #    pickle.dump(data_x, outfile, -1)
    #    print('data saved in data_new.pkl')
    return data_x, data_y  #, data['Body ID'].values

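# The 'gdbt' file names suggest these features feed a gradient-boosted tree
# model. A minimal sketch of how build_data()'s output might be consumed
# (the classifier choice and hyperparameters are assumptions, not taken from
# this repo):
def _train_gbdt_sketch():
    from sklearn.ensemble import GradientBoostingClassifier
    data_x, data_y = build_data()
    clf = GradientBoostingClassifier(n_estimators=200, random_state=2018)
    clf.fit(data_x, data_y)
    return clf
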
def build_data(): # create target variable body = pd.read_csv("train_bodies_processed.csv") stances = pd.read_csv("train_stances_processed.csv") data = pd.merge(stances, body, how='left', on='Body ID') # targets = ['agree', 'disagree', 'discuss', 'unrelated'] # targets_dict = dict(zip(targets, range(len(targets)))) targets_dict = {'agree': 1, 'disagree': 1, 'discuss': 1, 'unrelated': 0} data['target'] = map(lambda x: targets_dict[x], data['Stance']) data_y = data['target'].values # read features generators = [ CountFeatureGenerator(), TfidfFeatureGenerator(), SvdFeatureGenerator(), Word2VecFeatureGenerator(), NERFeatureGenerator() #SentimentFeatureGenerator() #AlignmentFeatureGenerator() ] features = [f for g in generators for f in g.read('train')] feature_boundary['TfidfFeature'] = 42 feature_boundary['SvdFeature'] = 143 feature_boundary['Word2VecFeature'] = 744 feature_boundary['EntFeature'] = 748 print[i.shape for i in features] data_x = np.hstack(features) # exit() print data_x[0, :] print 'data_x.shape' print data_x.shape print 'data_y.shape' print data_y.shape print 'body_ids.shape' print data['Body ID'].values.shape #with open('data_new.pkl', 'wb') as outfile: # cPickle.dump(data_x, outfile, -1) # print 'data saved in data_new.pkl' return data_x, data_y, data['Body ID'].values
def build_data(): # create target variable body = pd.read_csv("train_bodies.csv") stances = pd.read_csv("train_stances.csv") data = pd.merge(stances, body, how='left', on='Body ID') targets = ['agree', 'disagree', 'discuss', 'unrelated'] targets_dict = dict(zip(targets, range(len(targets)))) data['target'] = list(map(lambda x: targets_dict[x], data['Stance'])) data_y = data['target'].values # read features generators = [ CountFeatureGenerator(), TfidfFeatureGenerator(), SvdFeatureGenerator(), Word2VecFeatureGenerator(), SentimentFeatureGenerator() #AlignmentFeatureGenerator() ] features = [f for g in generators for f in g.read('train')] #for f in features: #features[1] = features[1].toarray() #print (type(f),f.shape) #data_x = np.hstack(features) #data_x = np.column_stack(features) data_x = np.hstack(features) print (data_x[0,:]) print ('data_x.shape') print (data_x.shape) print ('data_y.shape') print (data_y.shape) print ('body_ids.shape') print (data['Body ID'].values.shape) #with open('data_new.pkl', 'wb') as outfile: # cPickle.dump(data_x, outfile, -1) # print ('data saved in data_new.pkl' return data_x, data_y, data['Body ID'].values
def build_test_data():
    # create target variable
    # replace file names when test data is ready
    data = pd.read_csv('./data/merged_data_train.csv', encoding='utf-8')
    used_column = [
        'claimHeadline', 'articleHeadline', 'claimTruthiness', 'articleStance',
        'articleId'
    ]
    data = data[used_column].dropna()
    data['Headline'] = data['claimHeadline'].apply(lambda x: x[8:])
    data['articleBody'] = data['articleHeadline']
    data['Body ID'] = data['articleId']
    #targets = ['observing', 'for', 'against', 'ignoring']
    targets = ['unknown', 'false', 'true']
    targets_dict = dict(zip(targets, range(len(targets))))
    data['target'] = list(map(lambda x: targets_dict[x], data['claimTruthiness']))
    train = data.sample(frac=0.6, random_state=2018)
    test = data.loc[~data.index.isin(train.index)]

    # generate features
    generators = [
        CountFeatureGenerator(),
        TfidfFeatureGenerator(),
        SvdFeatureGenerator(),
        Word2VecFeatureGenerator(),
        SentimentFeatureGenerator()
    ]
    features = [f for g in generators for f in g.read("test")]
    print(len(features))
    data_x = np.hstack(features)
    print(data_x[0, :])
    print('test data_x.shape')
    print(data_x.shape)
    print('test body_ids.shape')
    print(test['Body ID'].values.shape)  # pair id
    return data_x, test['Body ID'].values, test['target']

def build_data(): # create target variable body = pd.read_csv("train_bodies_processed.csv") stances = pd.read_csv("../ensemble_learning/subtrain{}/train.csv".format( sys.argv[1])) data = pd.merge(stances, body, how='left', on='Body ID') targets = ['agree', 'disagree', 'discuss', 'unrelated'] targets_dict = dict(zip(targets, range(len(targets)))) data['target'] = map(lambda x: targets_dict[x], data['Stance']) data_y = data['target'].values # read features generators = [ CountFeatureGenerator(), TfidfFeatureGenerator(), SvdFeatureGenerator(), Word2VecFeatureGenerator(), SentimentFeatureGenerator() # AlignmentFeatureGenerator() ] features = [f for g in generators for f in g.read('train')] data_x = np.hstack(features) print data_x[0, :] print 'data_x.shape' print data_x.shape print 'data_y.shape' print data_y.shape print 'body_ids.shape' print data['Body ID'].values.shape # with open('data_new.pkl', 'wb') as outfile: # cPickle.dump(data_x, outfile, -1) # print 'data saved in data_new.pkl' return data_x, data_y, data['Body ID'].values
def process(): read = False if not read: body_train = pd.read_csv("train_bodies_processed.csv", encoding='utf-8') stances_train = pd.read_csv("train_stances_processed.csv", encoding='utf-8') # training set train = pd.merge(stances_train, body_train, how='left', on='Body ID') targets = ['agree', 'disagree', 'discuss', 'unrelated'] targets_dict = dict(zip(targets, range(len(targets)))) train['target'] = map(lambda x: targets_dict[x], train['Stance']) print 'train.shape:' print train.shape n_train = train.shape[0] data = train # read test set, no 'Stance' column in test set -> target = NULL # concatenate training and test set test_flag = True if test_flag: body_test = pd.read_csv("test_bodies_processed.csv", encoding='utf-8') headline_test = pd.read_csv("test_stances_unlabeled.csv", encoding='utf-8') test = pd.merge(headline_test, body_test, how="left", on="Body ID") data = pd.concat((train, test)) # target = NaN for test set print data print 'data.shape:' print data.shape train = data[~data['target'].isnull()] print train print 'train.shape:' print train.shape test = data[data['target'].isnull()] print test print 'test.shape:' print test.shape #data = data.iloc[:100, :] #return 1 print "generate unigram" data["Headline_unigram"] = data["Headline"].map( lambda x: preprocess_data(x)) data["articleBody_unigram"] = data["articleBody"].map( lambda x: preprocess_data(x)) print "generate bigram" join_str = "_" data["Headline_bigram"] = data["Headline_unigram"].map( lambda x: ngram.getBigram(x, join_str)) data["articleBody_bigram"] = data["articleBody_unigram"].map( lambda x: ngram.getBigram(x, join_str)) print "generate trigram" join_str = "_" data["Headline_trigram"] = data["Headline_unigram"].map( lambda x: ngram.getTrigram(x, join_str)) data["articleBody_trigram"] = data["articleBody_unigram"].map( lambda x: ngram.getTrigram(x, join_str)) with open('data.pkl', 'wb') as outfile: cPickle.dump(data, outfile, -1) print 'dataframe saved in data.pkl' else: with open('data.pkl', 'rb') as infile: data = cPickle.load(infile) print 'data loaded' print 'data.shape:' print data.shape #return 1 # define feature generators countFG = CountFeatureGenerator() tfidfFG = TfidfFeatureGenerator() svdFG = SvdFeatureGenerator() word2vecFG = Word2VecFeatureGenerator() sentiFG = SentimentFeatureGenerator() #walignFG = AlignmentFeatureGenerator() generators = [countFG, tfidfFG, svdFG, word2vecFG, sentiFG] #generators = [svdFG, word2vecFG, sentiFG] #generators = [tfidfFG] #generators = [countFG] #generators = [walignFG] for g in generators: g.process(data) for g in generators: g.read('train') #for g in generators: # g.read('test') print 'done'
def process():
    # note: fixed filename typo ('merged_data_tain.csv')
    full_data = pd.read_csv('./data/merged_data_train.csv', encoding='utf-8')
    used_column = [
        'claimHeadline', 'articleHeadline', 'claimTruthiness', 'articleStance'
    ]
    full_data = full_data[used_column]
    full_data = full_data.dropna()
    train, test = train_test_split(full_data, test_size=0.33, random_state=1234)

    read = False
    if not read:
        targets = ['observing', 'for', 'against', 'ignoring']
        targets_dict = dict(zip(targets, range(len(targets))))
        train['target'] = list(map(lambda x: targets_dict[x], train['articleStance']))
        print('train.shape:')
        print(train.shape)
        n_train = train.shape[0]
        data = train

        # read test set, no 'Stance' column in test set -> target = NULL
        # concatenate training and test set
        test_flag = True
        if test_flag:
            data = train
            print(data)
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        #data = data.iloc[:100, :]
        print("generate unigram")
        data["claimHeadline_unigram"] = data["claimHeadline"].map(
            lambda x: preprocess_data(x))
        data["articleHeadline_unigram"] = data["articleHeadline"].map(
            lambda x: preprocess_data(x))

        print("generate bigram")
        join_str = "_"
        data["claimHeadline_bigram"] = data["claimHeadline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))
        data["articleHeadline_bigram"] = data["articleHeadline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))

        print("generate trigram")
        data["claimHeadline_trigram"] = data["claimHeadline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))
        # build trigrams from unigrams, matching the claimHeadline branch
        # (the original mapped over the bigram column)
        data["articleHeadline_trigram"] = data["articleHeadline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            pickle.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')
    else:
        with open('data.pkl', 'rb') as infile:
            data = pickle.load(infile)
            print('data loaded')
            print('data.shape:')
            print(data.shape)

    # define feature generators
    countFG = CountFeatureGenerator()
    tfidfFG = TfidfFeatureGenerator()
    svdFG = SvdFeatureGenerator()
    word2vecFG = Word2VecFeatureGenerator()
    sentiFG = SentimentFeatureGenerator()
    #walignFG = AlignmentFeatureGenerator()
    generators = [countFG, tfidfFG, svdFG, word2vecFG, sentiFG]

    for g in generators:
        g.process(data)
    for g in generators:
        g.read('train')

    print('done')

def process():
    read = False
    if not read:
        '''
        body_train = pd.read_csv("train_bodies_processed.csv", encoding='utf-8')
        stances_train = pd.read_csv("train_stances_processed.csv", encoding='utf-8')
        # training set
        train = pd.merge(stances_train, body_train, how='left', on='Body ID')
        train.head()
        targets = ['agree', 'disagree', 'discuss', 'unrelated']
        targets_dict = dict(zip(targets, range(len(targets))))
        train['target'] = list(map(lambda x: targets_dict[x], train['Stance']))
        print('train.shape:')
        print(train.shape)
        n_train = train.shape[0]
        '''
        # sample starts
        sample_head = "Italy culls birds after five H5N8 avian flu outbreaks in October"
        sample_body = (
            "ROME (Reuters) - Italy has had five outbreaks of highly pathogenic "
            "H5N8 avian flu in farms in the central and northern parts of the "
            "country since the start of the month and about 880,000 chickens, "
            "ducks and turkeys will be culled, officials said on Wednesday. "
            "The biggest outbreak of the H5N8 virus, which led to the death or "
            "killing of millions of birds in an outbreak in western Europe last "
            "winter, was at a large egg producing farm in the province of Ferrara. "
            "The latest outbreak was confirmed on Oct. 6 and about 853,000 hens "
            "are due to be culled by Oct. 17, the IZSV zoological institute said. "
            "Another involved 14,000 turkeys in the province of Brescia, which "
            "are due to be culled by Oct. 13. "
            "A third involved 12,400 broiler chickens at a smaller farm in the "
            "province of Vicenza and two others were among a small number of "
            "hens, ducks, broilers and turkeys on family farms. "
            "In those three cases, all the birds have been culled.")
        sample_head_pd = pd.DataFrame([sample_head])
        sample_body_pd = pd.DataFrame([sample_body])
        sample_data_pd = pd.concat((sample_head_pd, sample_body_pd), axis=1)
        sample_data_pd.columns = ['Headline', 'articleBody']
        sample_data_pd['URLs'] = np.nan
        sample_data_pd['Stance'] = np.nan
        # sample ends

        dataset = pd.read_csv('data.csv')
        print(dataset.isnull().sum())
        dataset = dataset[pd.notnull(dataset['Body'])]
        dataset.columns = ['URLs', 'Headline', 'articleBody', 'Stance']
        X_data = dataset.iloc[:, 1:3]
        Y_data = dataset.iloc[:, 3]

        # sklearn.cross_validation has been removed; use model_selection
        from sklearn.model_selection import train_test_split
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_data, Y_data, test_size=0.25, random_state=0)

        train = pd.concat([X_train, Y_train], axis=1)
        train.to_csv('gdbt_training_input.csv', index=False)
        X_test.to_csv('gdbt_testing_input.csv', index=False)
        Y_test = pd.DataFrame(Y_test)
        Y_test.to_csv('gdbt_testing_ouput.csv', index=False)

        targets = ['Fake', 'Real']
        targets_dict = dict(zip(targets, range(len(targets))))
        train['target'] = list(map(lambda x: targets_dict[x], train['Stance']))
        data = train

        # read test set, no 'Stance' column in test set -> target = NULL
        # concatenate training and test set
        test_flag = True
        if test_flag:
            '''
            body_test = pd.read_csv("test_bodies_processed.csv", encoding='utf-8')
            headline_test = pd.read_csv("test_stances_unlabeled.csv", encoding='utf-8')
            test = pd.merge(headline_test, body_test, how="left", on="Body ID")
            '''
            data = pd.concat((train, X_test))  # target = NaN for test set
            #print(data)
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        #data = data.iloc[:100, :]
        print("generate unigram")
        data["Headline_unigram"] = data["Headline"].map(lambda x: preprocess_data(x))
        print(data.head())
        data["articleBody_unigram"] = data["articleBody"].map(lambda x: preprocess_data(x))
print("generate bigram") join_str = "_" data["Headline_bigram"] = data["Headline_unigram"].map( lambda x: ngram.getBigram(x, join_str)) data["articleBody_bigram"] = data["articleBody_unigram"].map( lambda x: ngram.getBigram(x, join_str)) print("generate trigram") join_str = "_" data["Headline_trigram"] = data["Headline_unigram"].map( lambda x: ngram.getTrigram(x, join_str)) data["articleBody_trigram"] = data["articleBody_unigram"].map( lambda x: ngram.getTrigram(x, join_str)) with open('data.pkl', 'wb') as outfile: pickle.dump(data, outfile, -1) print('dataframe saved in data.pkl') else: with open('data.pkl', 'rb') as infile: data = pickle.load(infile) print('data loaded') print('data.shape:') print(data.shape) #return 1 # define feature generators countFG = CountFeatureGenerator() tfidfFG = TfidfFeatureGenerator() svdFG = SvdFeatureGenerator() word2vecFG = Word2VecFeatureGenerator() sentiFG = SentimentFeatureGenerator() #walignFG = AlignmentFeatureGenerator() generators = [countFG, tfidfFG, svdFG, word2vecFG, sentiFG] #generators = [svdFG, word2vecFG, sentiFG] #generators = [tfidfFG] #generators = [countFG] #generators = [walignFG] #countFG.process(data) #countFG.read() #word2vecFG.process(data) #sentiFG.process(data) for g in generators: g.process(data) for g in generators: g.read('train') for g in generators: g.read('test') print('done')