예제 #1
0
def prepareData(data, NUM_OF_ANS, TIME, not_saved=True):
    """Build, or reload from cache, the answerer collections and question tags.

    With ``not_saved`` true the values are computed from ``data`` and each one
    is written to a file under ``DIR + 'temp/'`` right after it is computed.
    Otherwise the same files are read back instead of recomputing.

    Args:
        data: object indexable by ``'AnswererId'``; also handed to
            ``respAnswerers`` and ``tga.tagList``.
        NUM_OF_ANS: second argument passed to ``activeAnswerers`` for the
            "active" list (the "all" list is built with ``0``).
        TIME: second argument passed to ``respAnswerers``.
        not_saved: truthy to compute and cache, falsy to load the cache.

    Returns:
        A pair ``(answerers, quest_tags)`` where ``answerers`` is the list
        ``[act_answerers, all_answerers, resp_users_ans]``.
    """
    cache_dir = DIR + 'temp/'
    all_csv = cache_dir + 'all_answerers.csv'
    act_csv = cache_dir + 'act_answerers.csv'
    resp_csv = cache_dir + 'resp_users_ans.csv'
    tags_csv = cache_dir + 'quest_tags.csv'

    if not not_saved:
        # Cached path: ids are stored as text, so cast them back to int.
        everyone = list(map(int, util.openListFromCSV(all_csv)))
        active = list(map(int, util.openListFromCSV(act_csv)))
        # NOTE(review): iterating the loaded object keeps only what iteration
        # yields (the keys, if openDictfromCSV returns a dict), so this branch
        # returns a list of ints while the compute branch returns
        # respAnswerers' raw result -- confirm callers expect that.
        responsive = list(map(int, util.openDictfromCSV(resp_csv)))
        question_tags = util.openListFromCSV(tags_csv)
        return [active, everyone, responsive], question_tags

    # Compute path: missing answerer ids become 0 before the integer cast.
    answerer_ids = data['AnswererId'].fillna(0.0).astype(int)

    everyone = activeAnswerers(answerer_ids, 0)
    util.saveElemsToCSV(all_csv, everyone, header='')

    active = activeAnswerers(answerer_ids, NUM_OF_ANS)
    util.saveElemsToCSV(act_csv, active, header='')

    responsive = respAnswerers(data, TIME)
    util.writeDict(resp_csv, responsive)

    # the list of tags for all questions
    question_tags = tga.tagList(data)
    util.saveElemsToCSV(tags_csv, question_tags, header='')

    return [active, everyone, responsive], question_tags
예제 #2
0
def extractTagFeatures(DIR, df_tags):
    """Compute the tag-based features and write them as CSV files.

    All outputs go to ``DIR + 'posts/tags'``, which is created if it does not
    exist. The function reads ``DIR + 'posts/quest_stats.csv'`` as the input
    for the tag occurrence tables.

    CAUTION: takes a long time for large datasets.

    Args:
        DIR: base directory prefix. It is concatenated directly with
            ``'posts/...'``, so it should already end with a path separator.
        df_tags: input handed to ``extractNumTags`` and ``tags.tagFeatures``
            (presumably a DataFrame of questions with their tags -- confirm
            against those helpers).

    Returns:
        None. Every result is written to disk.
    """
    tags_dir = DIR + 'posts/tags'
    out = tags_dir + '/'

    num_tags = extractNumTags(df_tags)
    # Single-argument print(...) prints the same text on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print('extracted the feature: NUM_TAGS')
    if not os.path.exists(tags_dir):
        os.makedirs(tags_dir)
    num_tags.to_csv(out + 'num_tags.csv', index=False)

    quest_stats = pd.read_csv(DIR + 'posts/quest_stats.csv')
    single_occ = tags.uniqueTags(quest_stats)
    single_occ.to_csv(out + '1Tags_occurancy.csv', index=False)

    tag_features = tags.tagFeatures(df_tags, single_occ)
    tag_features.to_csv(out + 'tag_features.csv', index=False)
    print('extracted the features: TAG_POPULARITY_AV, NUM_POPTAGS')

    two_tags, pair_occ = tags.tags(quest_stats)
    two_tags.to_csv(out + 'two_tags.csv', index=False)
    pair_occ.to_csv(out + '2Tags_occurancy1.csv', index=False)

    # optimization step - to create a table with only unique tags
    unique_occ = tags.uniqueTagsFromTwoDf(single_occ, pair_occ)
    unique_occ.to_csv(out + '1Tags_unique_occ.csv', index=False)

    occurrences = tags.specificityCalc(pair_occ, unique_occ)
    occurrences.to_csv(out + 'Tags_occurancy.csv', index=False)

    # Both tables are deliberately re-read from the files just written, as in
    # the original flow, so matching works on the CSV round-tripped data.
    occurrences = pd.read_csv(out + 'Tags_occurancy.csv')
    two_tags = pd.read_csv(out + 'two_tags.csv')
    matched = tags.matchAtoB(occurrences, two_tags)
    print('extracted the feature: TAG_SPECIFICITY')
    matched.to_csv(out + 'tag_specificity.csv', index=False)
예제 #3
0
파일: UserStats.py 프로젝트: Nik0l/UTemPro
def prepareData(data, NUM_OF_ANS, TIME, not_saved=True):
    """Return the answerer lists and question tags, computing or loading them.

    If ``not_saved`` is truthy the values are derived from ``data`` and each
    is stored in a file below ``DIR + 'temp/'`` as soon as it is available.
    If it is falsy those files are loaded instead.

    Args:
        data: object indexable by ``'AnswererId'``; it is also passed to
            ``respAnswerers`` and ``tga.tagList``.
        NUM_OF_ANS: value passed to ``activeAnswerers`` when building the
            "active" list (``0`` is used for the "all" list).
        TIME: value passed to ``respAnswerers``.
        not_saved: selects computing (truthy) versus loading (falsy).

    Returns:
        ``(answerers, quest_tags)`` with ``answerers`` being
        ``[act_answerers, all_answerers, resp_users_ans]``.
    """
    def temp_file(name):
        # Cache files all live under the same 'temp/' prefix.
        return DIR + 'temp/' + name

    if not_saved:
        # Missing answerer ids are replaced by 0 before casting to int.
        user_ids = data['AnswererId'].fillna(0.0).astype(int)

        all_users = activeAnswerers(user_ids, 0)
        util.saveElemsToCSV(temp_file('all_answerers.csv'),
                            all_users,
                            header='')

        active_users = activeAnswerers(user_ids, NUM_OF_ANS)
        util.saveElemsToCSV(temp_file('act_answerers.csv'),
                            active_users,
                            header='')

        resp_users = respAnswerers(data, TIME)
        util.writeDict(temp_file('resp_users_ans.csv'), resp_users)

        # the list of tags for all questions
        tag_list = tga.tagList(data)
        util.saveElemsToCSV(temp_file('quest_tags.csv'), tag_list, header='')
    else:
        # Values come back from the CSV files and are cast to int here.
        raw_all = util.openListFromCSV(temp_file('all_answerers.csv'))
        all_users = [int(item) for item in raw_all]

        raw_active = util.openListFromCSV(temp_file('act_answerers.csv'))
        active_users = [int(item) for item in raw_active]

        # NOTE(review): only what iterating the loaded object yields is kept
        # (its keys, if openDictfromCSV returns a dict); the compute branch
        # returns respAnswerers' result unchanged -- confirm this asymmetry
        # is what callers expect.
        raw_resp = util.openDictfromCSV(temp_file('resp_users_ans.csv'))
        resp_users = [int(item) for item in raw_resp]

        tag_list = util.openListFromCSV(temp_file('quest_tags.csv'))

    return [active_users, all_users, resp_users], tag_list