def prepareData(data, NUM_OF_ANS, TIME, not_saved=True):
    """Build, or reload from cache, the answerer collections and question tags.

    When ``not_saved`` is truthy everything is computed from ``data`` and
    written to files under ``DIR + 'temp/'``; otherwise those files are read
    back instead of recomputing.

    Args:
        data: table indexable by ``'AnswererId'``; also passed to
            ``respAnswerers`` and ``tga.tagList``.
        NUM_OF_ANS: threshold handed to ``activeAnswerers`` for the
            "active" answerer selection.
        TIME: value handed to ``respAnswerers``.
        not_saved: compute and cache when truthy, reload the cache when falsy.

    Returns:
        A pair ``(answerers, quest_tags)`` where ``answerers`` is the list
        ``[act_answerers, all_answerers, resp_users_ans]``.
    """
    if not not_saved:
        # Cached run: read the CSV files back and cast the ids to int.
        all_answerers = list(
            map(int, util.openListFromCSV(DIR + 'temp/all_answerers.csv')))
        act_answerers = list(
            map(int, util.openListFromCSV(DIR + 'temp/act_answerers.csv')))
        loaded_resp = util.openDictfromCSV(DIR + 'temp/resp_users_ans.csv')
        # NOTE(review): iterating the loaded object keeps only the items it
        # yields (presumably the dict keys), so this branch returns a list of
        # ints while the fresh branch returns respAnswerers' result as-is.
        # Confirm callers expect that difference.
        resp_users_ans = list(map(int, loaded_resp))
        quest_tags = util.openListFromCSV(DIR + 'temp/quest_tags.csv')
    else:
        # Missing answerer ids are filled with 0 before the integer cast.
        answerer_ids = data['AnswererId'].fillna(0.0).astype(int)

        all_answerers = activeAnswerers(answerer_ids, 0)
        util.saveElemsToCSV(
            DIR + 'temp/all_answerers.csv', all_answerers, header='')

        act_answerers = activeAnswerers(answerer_ids, NUM_OF_ANS)
        util.saveElemsToCSV(
            DIR + 'temp/act_answerers.csv', act_answerers, header='')

        resp_users_ans = respAnswerers(data, TIME)
        util.writeDict(DIR + 'temp/resp_users_ans.csv', resp_users_ans)

        # The list of tags for all questions.
        quest_tags = tga.tagList(data)
        util.saveElemsToCSV(
            DIR + 'temp/quest_tags.csv', quest_tags, header='')

    return [act_answerers, all_answerers, resp_users_ans], quest_tags
def extractTagFeatures(DIR, df_tags):
    """Run the tag feature pipeline and write every stage to CSV.

    Reads ``DIR + 'posts/quest_stats.csv'`` and writes the following files
    under ``DIR + 'posts/tags/'`` (the directory is created if absent):
    ``num_tags.csv``, ``1Tags_occurancy.csv``, ``tag_features.csv``,
    ``two_tags.csv``, ``2Tags_occurancy1.csv``, ``1Tags_unique_occ.csv``,
    ``Tags_occurancy.csv`` and ``tag_specificity.csv``.

    Args:
        DIR: path prefix; file names are appended to it by plain string
            concatenation, so it must already end with a path separator.
        df_tags: input passed to ``extractNumTags`` and ``tags.tagFeatures``.

    Returns:
        None. Results are only written to disk; progress is printed.
    """
    # CAUTION: takes a long time for large datasets.
    # print(...) with a single string prints the same text under the
    # Python 2 print statement this file uses.
    num_tags = extractNumTags(df_tags)
    print('extracted the feature: NUM_TAGS')
    tags_dir = DIR + 'posts/tags'
    if not os.path.exists(tags_dir):
        os.makedirs(tags_dir)
    num_tags.to_csv(DIR + 'posts/tags/num_tags.csv', index=False)

    # Single-tag occurrences, computed from the question statistics table.
    quest_stats = pd.read_csv(DIR + 'posts/quest_stats.csv')
    single_occ = tags.uniqueTags(quest_stats)
    single_occ.to_csv(DIR + 'posts/tags/1Tags_occurancy.csv', index=False)

    features = tags.tagFeatures(df_tags, single_occ)
    features.to_csv(DIR + 'posts/tags/tag_features.csv', index=False)
    print('extracted the features: TAG_POPULARITY_AV, NUM_POPTAGS')

    pairs, pair_occ = tags.tags(quest_stats)
    pairs.to_csv(DIR + 'posts/tags/two_tags.csv', index=False)
    pair_occ.to_csv(DIR + 'posts/tags/2Tags_occurancy1.csv', index=False)

    # Optimization step - create a table with only unique tags.
    single_unique = tags.uniqueTagsFromTwoDf(single_occ, pair_occ)
    single_unique.to_csv(
        DIR + 'posts/tags/1Tags_unique_occ.csv', index=False)

    specificity = tags.specificityCalc(pair_occ, single_unique)
    specificity.to_csv(DIR + 'posts/tags/Tags_occurancy.csv', index=False)

    # Both inputs to the match are re-read from the files just written
    # rather than reused from memory, exactly as the pipeline always did.
    occ_from_disk = pd.read_csv(DIR + 'posts/tags/Tags_occurancy.csv')
    pairs_from_disk = pd.read_csv(DIR + 'posts/tags/two_tags.csv')
    matched = tags.matchAtoB(occ_from_disk, pairs_from_disk)
    print('extracted the feature: TAG_SPECIFICITY')
    matched.to_csv(DIR + 'posts/tags/tag_specificity.csv', index=False)