Code example #1
def get_all_dataFrame(data_path):
    """
    根据所有的文件构建一个dataFrame。文件格式为['tweet_id', 'origin_tweet_id', 'from_user','from_user_id','to_user','to_user_id', 'tweet_time', 'origin_tweet_time', 'type']
    :param data_path:存储文件的位置。
    :return:构建之后的dataFrame。
    """
    file_name_list = get_dirlist(path=data_path, key_word_list=['hash_qianlong'])
    dataFrame_list = []
    tweet_id_dict = {}
    index = 0
    for file_name in file_name_list:
        index += 1
        print time.ctime(), str(index) + ': Reading file to dataFrame:' + file_name + ' is being reading...'

        data = pd.read_csv(data_path + file_name, header = None)
        data.columns = ['tweet_id', 'origin_tweet_id', 'from_user','from_user_id','to_user','to_user_id', 'tweet_time', 'origin_tweet_time', 'type']
        del data['from_user']
        del data['from_user_id']
        del data['to_user']
        del data['to_user_id']
        data.index = data.tweet_id
        tweet_id_dict[file_name] = list(data.tweet_id)
        dataFrame_list.append(data)
    tweet_dataFrame = pd.concat(dataFrame_list, ignore_index = False)
    tweet_dataFrame.index = tweet_dataFrame.tweet_id
    print tweet_dataFrame
    return tweet_dataFrame
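All of the examples on this page call get_dirlist from utility.functions, whose implementation is not shown. The following is a minimal sketch of what it appears to do, inferred only from how it is called (filtering the file names in a directory by required and forbidden keywords); the body is an assumption, not the project's actual code.

import os

def get_dirlist(path, key_word_list=None, no_key_word_list=None):
    # Hypothetical reconstruction: list the file names under `path` that contain
    # every keyword in key_word_list and none of the keywords in no_key_word_list.
    file_name_list = []
    for file_name in os.listdir(path):
        if key_word_list and not all(key in file_name for key in key_word_list):
            continue
        if no_key_word_list and any(key in file_name for key in no_key_word_list):
            continue
        file_name_list.append(file_name)
    return file_name_list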
Code example #2
File: calculate_lifecycle.py  Project: LiuQL2/twitter
def read_csv(data_path):
    """
    根据(tweet_id, tweet_time, root_tweet_id, root_tweet_time)格式的文件构建dataFrame_dict,其中字典中每一个dataFrame的名称和文件名称相同,一个文件的数据存储在一个dataFrame中。
    :param data_path:文件路径
    :return:构建之后的dataFrame_dict.
    """
    file_name_list = get_dirlist(path = data_path, key_word_list=['hash_qianlong'])
    dataFrame_dict = {}
    index = 0
    for file_name in file_name_list:
        index += 1
        # write_log(log_file_name='calculate_lifecycle.log',log_file_path=os.getcwd().replace('process',''),information=str(index) + ': Reading file to dataFrame:' + file_name + ' is being reading...')
        print time.ctime(), str(index) + ': Reading file to dataFrame:' + file_name + ' is being reading...'
        data = pd.read_csv(data_path + file_name, header = None)
        data.columns = ['tweet_id', 'tweet_time', 'origin_tweet_id', 'origin_tweet_time']
        data = data[data.origin_tweet_time != 'null']

        data['lifecycle'] = data.tweet_time.apply(time_timestamp) - data.origin_tweet_time.apply(time_timestamp)
        # data =  data[data.lifecycle != 0.0]

        del data['tweet_id']
        data.columns = ['end_time', 'tweet_id', 'start_time', 'lifecycle']

        # print data
        data = data.drop_duplicates()
        dataFrame_dict[file_name] = data
    # write_log(log_file_name='calculate_lifecycle.log', log_file_path=os.getcwd(),information='tweet_dataFrame has been built, total number:')

    return dataFrame_dict
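The lifecycle above is computed by mapping both time columns through time_timestamp, a helper that is not shown on this page. A plausible sketch, assuming the tweet times are ISO-8601 strings such as '2016-03-23T12:00:00.000Z', is given below; the time format is an assumption.

import time

def time_timestamp(time_string):
    # Hypothetical helper: convert a tweet-time string into a UNIX timestamp in
    # seconds, so that subtracting two of them yields a lifecycle in seconds.
    # The '%Y-%m-%dT%H:%M:%S.000Z' format is an assumption about the data export.
    return time.mktime(time.strptime(time_string, '%Y-%m-%dT%H:%M:%S.000Z'))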
Code example #3
File: find_root_tweet.py  Project: LiuQL2/twitter
def get_all_dataFrame(data_path):
    """
    根据所有的hash之后的文件进行为每一个文件构建一个dataFrame,最后所有的dataFrame放入一个dict中,key名即为文件的名称
    :param data_path: hash文件存储的路径。
    :return:保存dataFrame的dict
    """
    file_name_list = get_dirlist(path=data_path, key_word_list=['hash_qianlong'])
    dataFrame_dict = {}
    index = 0
    for file_name in file_name_list:
        index += 1
        write_log(log_file_name='find_root_tweet.log',log_file_path=os.getcwd(),information=str(index) + ': Reading file to dataFrame:' + file_name + ' is being reading...')
        print time.ctime(), str(index) + ': Reading file to dataFrame:' + file_name + ' is being reading...'
        data = pd.read_csv(data_path + file_name, header = None)
        data.columns = ['tweet_id', 'origin_tweet_id', 'from_user','from_user_id','to_user','to_user_id', 'tweet_time', 'origin_tweet_time', 'type']
        data = data[data.origin_tweet_time != 'null']
        data = data[data.type != 'mention']
        del data['from_user']
        del data['from_user_id']
        del data['to_user']
        del data['to_user_id']
        data.index = data.tweet_id
        dataFrame_dict[file_name] = data
    write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),information='tweet_dataFrame has been built, total number:')

    return dataFrame_dict
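write_log is used here and in several later examples (#6, #9, #13) but is not defined on this page. A minimal sketch, assuming it simply appends a timestamped line to the named log file, is shown below; the signature mirrors the calls above, the body is a guess.

import os
import time

def write_log(log_file_name, log_file_path, information):
    # Hypothetical sketch: append a timestamped line to the given log file.
    with open(os.path.join(log_file_path, log_file_name), 'a') as log_file:
        log_file.write(time.ctime() + ' ' + information + '\n')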
Code example #4
def calculate_cohesion_whole_network(file_path, node_file_name, cohesion_type, cohesion_file, edge_file_name=None, edge_file_path=None,
                                     edge_file_key_word_list=None):
    """
    计算网络的聚合度,这里讲将边文件进行hash处理,以便快速查询。对于点文件,因为不同社区的聚合度为0,所以这里按照社区进行。一个一个考虑。
    这里分为整个网络和部分社区。整个网络需要将变文件hash存储。部分社区不需要。
    :param file_path:节点文件所在的路径,如果是针对部分社区的,也是边文件所在的路径
    :param node_file_name:节点文件名称,第一行是列名。无论整个网络还是部分社区,都是统一格式(user_id,community_id, out_degree,in_degree,degree)
    :param cohesion_type:  处理的类型,是针对整个网络(’whole‘),还是针对部分社区(’community‘)。如果是针对的部分社区,需要对应的社区文件。
    :param edge_file_name:如果针对的是部分社区,该文件为边文件。且第一行为列名(source, target, number_of_interaction, weight)
    :param edge_file_path: 如果是针对整个网络,该路径为存储hash边文件的路径。
    :param edge_file_key_word_list: 如果是针对整个网络的话,
    :return:返回计算的结果。
    """

    node_dataFrame = pd.read_csv(file_path + node_file_name, dtype={'user_id': np.str})
    node_dataFrame.index = node_dataFrame.user_id
    if cohesion_type == 'hash':
        edge_file_list = get_dirlist(path=edge_file_path, key_word_list=edge_file_key_word_list)
        print len(edge_file_list)
        print edge_file_list
        # time.sleep(20)
        edge_dataFrame_dict = {}
        for edge_file_name in edge_file_list:
            number = int((edge_file_name.split('hash_')[-1]).split('.')[0])
            edge_dataFrame_dict[number] = pd.read_csv(edge_file_path + edge_file_name,
                                                      dtype={'source': np.str, 'target': np.str})
    else:
        edge_dataFrame_dict = pd.read_csv(file_path + edge_file_name, header=0,
                                          dtype={'source': np.str, 'target': np.str})

    lock = threading.Lock()

    community_id_list = list(set(list(node_dataFrame.community_id)))
    print 'number 0f community:', len(community_id_list)
    # time.sleep(10)
    thread_list = []
    for community_id in community_id_list:
        community_node_dataFrame = node_dataFrame[node_dataFrame.community_id == community_id]
        thread = cohesionThread(edge_dataFrame=edge_dataFrame_dict, node_dataFrame=community_node_dataFrame,
                                community_id=community_id,cohesion_file = cohesion_file,lock=lock)
        thread.start()
        thread_list.append(thread)
        # break

    for thread in thread_list:
        thread.join()
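Note that the docstring above describes a 'whole' mode while the code actually branches on cohesion_type == 'hash'. A hypothetical call for that whole-network branch might look as follows; every path and file name is a placeholder, not taken from the project.

# Hypothetical whole-network invocation; all paths and names are placeholders.
calculate_cohesion_whole_network(
    file_path='/data/network/',
    node_file_name='nodes_with_community.csv',
    cohesion_type='hash',                       # the branch the code above checks for
    cohesion_file='/data/network/cohesion.csv',
    edge_file_path='/data/network/edge_hash/',
    edge_file_key_word_list=['hash_'])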
Code example #5
File: simplifyData.py  Project: LiuQL2/twitter
def classify_data():
    path_data = raw_input(
        'Please input the path of directory where the FILES NEEDED to be CLASSIFIED are:'
    )
    path_save_to = raw_input(
        'Please input the path of directory where you want the RESULT FILE saves to:'
    )
    file_save_to = open(path_save_to + 'total_data.json', 'wb')
    file_name_list = get_dirlist(path=path_data,
                                 key_word_list=['part-', '.json'])
    index = 0
    for file_name in file_name_list:
        index = index + 1
        print index, file_name, 'is being classifing......'
        read_file(file_name=file_name,
                  path_data=path_data,
                  file_save_to=file_save_to)
    file_save_to.close()

    print 'The result file has been saved to: ', path_save_to + 'total_data.json'
Code example #6
def find_user(path_data, path_save_to, file_save_to):
    file_name_list = get_dirlist(path=path_data,
                                 key_word_list=['part-r', '.json', '33b34d49'],
                                 no_key_word_list=['crc'])
    print len(file_name_list)
    time.sleep(40)
    file_save = open(path_save_to + file_save_to, 'wb')
    file_writer = csv.writer(file_save)
    print file_name_list
    file_index = 0
    for file_name in file_name_list:
        file_index = file_index + 1
        file = open(path_data + file_name, 'r')
        write_log(log_file_name='find_verified_user.log',
                  log_file_path=os.getcwd(),
                  information='file index:' + str(file_index) +
                  ' is being processing.')
        for line in file:
            try:
                print len(line)
                row = json.loads(line, object_pairs_hook=OrderedDict)
                actor = [
                    row['actor']['id'], row['actor']['verified'],
                    row['actor']['preferredUsername']
                ]
                file_writer.writerow(actor)
                print 'file index:', file_index, actor
                if row['type'] == 'retweet':
                    origin_actor = [
                        row['originActor']['id'],
                        row['originActor']['verified'],
                        row['originActor']['preferredUsername']
                    ]
                    file_writer.writerow(origin_actor)
                else:
                    pass
            except:
                print file_index, '*' * 100
                pass
        file.close()
    file_save.close()
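find_user indexes each JSON line as row['actor']['id'], row['actor']['verified'], row['actor']['preferredUsername'] and, for retweets, the same fields under originActor. A minimal record shaped the way the code expects is sketched below; it is an assumption about the activity format, not a sample from the data set.

import json
from collections import OrderedDict

# Hypothetical record containing just the fields find_user reads.
sample_line = ('{"type": "retweet", '
               '"actor": {"id": "id:twitter.com:123", "verified": false, "preferredUsername": "alice"}, '
               '"originActor": {"id": "id:twitter.com:456", "verified": true, "preferredUsername": "bob"}}')
row = json.loads(sample_line, object_pairs_hook=OrderedDict)
print(row['actor']['preferredUsername'])
print(row['originActor']['verified'])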
Code example #7
 def __init__(self,
              topic_words_file_path,
              font_path=None,
              key_word_list=list(),
              no_key_words_list=list(),
              max_topic_number=4):
     """
     用来初始化一个主题词云的实例
     :param topic_words_file_path: 主题top词文件所在的目录。
     :param font_path: 字体路径,默认为空,在windows系统上需要赋值。
     :param key_word_list: 在目录中读取主题词文件的时候文件名所需要的关键词
     :param no_key_words_list:在目录中读取主题词文件的时候文件名不包含的关键词
     :param max_topic_number:一个社区最多画多少个主题的词云。
     :return:Nothing to return。
     """
     self.topic_words_filename_list = get_dirlist(
         topic_words_file_path,
         key_word_list=key_word_list,
         no_key_word_list=no_key_words_list)
     self.topic_words_file_path = topic_words_file_path
     self.community_topics = {}  # topics of each community; only read when that community is actually selected for plotting
     self.community_file = {}  # topic-word files of each community; one community may have several files, so record which files belong to which community so they can be read directly when plotting
     self.font_path = font_path  # font path; some scripts, e.g. Arabic, need a specific font
     self.error_community_id_list = []
     self.full_width_community_id_list = []
     self.max_topic_number = max_topic_number
     community_id_list = []
     for file_name in self.topic_words_filename_list:
         community_id = int(file_name.split('-')[1])
         community_id_list.append(community_id)
     self.community_id_list = list(
         set(community_id_list))  # list of community ids; the ids of all communities are here
     for community_id in community_id_list:
         self.community_file[community_id] = []
     for file_name in self.topic_words_filename_list:
         community_id = int(file_name.split('-')[1])
         self.community_file[community_id].append(file_name)
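The constructor derives a community id from the second '-'-separated field of each file name, so the topic-word files are assumed to follow a pattern like the one below; the concrete name is made up for illustration.

# Hypothetical file name matching the pattern the constructor assumes.
file_name = 'topwords-12-all-langs.csv'
community_id = int(file_name.split('-')[1])   # -> 12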
Code example #8
    def filter_data(self, origin_file_path, reserved_data_save_to,
                    filtered_data_save_to):
        self.reserved_data_save_to = reserved_data_save_to
        self.filtered_data_save_to = filtered_data_save_to
        self.origin_file_path = origin_file_path
        file_name_list = get_dirlist(
            origin_file_path,
            key_word_list=['part-r', '.json'],
            no_key_word_list=['crc'])  # get all file names in the directory holding the original json files
        index = 0
        start_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        for file_name in file_name_list:
            index = index + 1
            print index, file_name, 'is being parsing......'
            self.__read_json__(file_name)

        end_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))

        parse_info_file = open(os.getcwd() + '/filter_data_info.log', 'wb')
        parse_info_file.write("start time:" + str(start_time) + '\n')
        parse_info_file.write("end time:" + str(end_time) + '\n')
        parse_info_file.write("total number of files that parsed:" +
                              str(index) + '\n')
        parse_info_file.write("total number of Dubai tweet:" +
                              str(self.dubai_data_number) + '\n')
        parse_info_file.write("total number of No Dubai tweet:" +
                              str(self.no_dubai_data_number) + '\n')
        parse_info_file.close()

        print '================================================================='
        print 'start_time:', start_time
        print 'end_time:', end_time
        print "total number of Dubai tweet:", self.dubai_data_number
        print "total number of No Dubai tweet:", self.no_dubai_data_number
        print '================================================================='
Code example #9
def build_tweet_dataFrame_dict(file_path):
    """
    build tweet_dataFrame_list, that a dataFrame contains the data of a file, and all dataFrames put into one list.
    :param file_path: the path of directory that files in.
    :return: the tweet_dataFrame_list, tweet_dataFrame_index_list, the number of Dubai's actors.
    """
    tweet_dataFrame_dict = {}
    file_name_list = get_dirlist(file_path)
    for file_name in file_name_list:
        tweet_dataFrame_dict[file_name] = list()

    dubai_actor_dict = get_dubai_actor_dict(file_name_list=file_name_list,
                                            file_path=file_path)
    index = 0
    for file_name in file_name_list:
        index = index + 1
        print index, ': BUILDING TWEET DATAFRAME according to file:', index, file_name
        write_log(log_file_name='calculate_lifecycle_own_apart.log',
                  log_file_path=os.getcwd(),
                  information=str(index) +
                  ': BUILDING TWEET DATAFRAME according to file: ' +
                  str(file_name))
        tweet_dataFrame = build_tweet_dataFrame(file_name=file_name,
                                                file_path=file_path)
        tweet_dataFrame_dict[file_name] = tweet_dataFrame

    tweet_dataFrame_dict = add_origin_tweet_to_dataFrame(
        file_name_list=file_name_list,
        file_path=file_path,
        tweet_dataFrame_dict=tweet_dataFrame_dict,
        actor_dict=dubai_actor_dict)

    actor_number = 0
    for key in dubai_actor_dict.keys():
        actor_number = actor_number + len(dubai_actor_dict[key])
    return tweet_dataFrame_dict, actor_number
Code example #10
                                  header=None,
                                  names=names,
                                  dtype={
                                      'user_id': np.str,
                                      'community': np.int32
                                  })
    print 'get it'
    print nodes_dataFrame
    community_list = set(list(nodes_dataFrame.community))
    community_file = open(path_save_to + community_size_file, 'wb')
    writer = csv.writer(community_file)
    writer.writerow(['community_id', 'number_of_user'])
    for community in community_list:
        number = len(nodes_dataFrame[nodes_dataFrame.community == community])
        writer.writerow([community, number])
    community_file.close()


userId_communityId_file_path = '/pegasus/harir/yangjinfeng/commitresult/community2/'
path_save_to = '/pegasus/harir/Qianlong/data/network/community_size/'
userId_communityId_file_list = get_dirlist(userId_communityId_file_path,
                                           key_word_list=['node-com.txt'])
for file in userId_communityId_file_list:
    community_size_file_name = file.replace('.icpm.node-com.txt',
                                            '_communityId_size.csv')
    get_community_nodes(
        userId_communityId_file_path_name=userId_communityId_file_path + file,
        path_save_to=path_save_to,
        community_size_file=community_size_file_name,
        sep=',',
        names=['user_id', 'community'])
Code example #11
    #    for words_type in words_type_list:
    #        print path
    #        words_file = path + get_dirlist(path,key_word_list=[words_type])[0]
    #        print words_file
    #        save_to = image_save_to + directory_name + '/' + words_type + '/'
    #        cloud = commmunityTopWordCloud(top_words_path_file=words_file,background_color=background_color,font_path=font_path)
    #        print 'number of communities:', len(cloud.community_id_list)
    #        # time.sleep(5)
    #        cloud.plot_word_cloud(image_save_to=save_to,file_name_key_word=words_type,number_of_community=number_of_community,community_id_list=[],full_width_community=False)
    #        print directory_name, words_type, cloud.full_width_community_id_list

    #for April
    for directory_name in directory_list:
        for words_type in words_type_list:
            path = top_words_path + directory_name + '/' + words_type + '/'
            print path
            words_file = path + get_dirlist(path, key_word_list=['all-langs'
                                                                 ])[0]
            print words_file
            save_to = image_save_to + directory_name + '/'
            cloud = commmunityTopWordCloud(top_words_path_file=words_file,
                                           background_color=background_color,
                                           font_path=font_path)
            print 'number of communities:', len(cloud.community_id_list)
            # time.sleep(5)
            cloud.plot_word_cloud(image_save_to=save_to,
                                  file_name_key_word=words_type,
                                  number_of_community=number_of_community,
                                  community_id_list=[],
                                  full_width_community=False)
            print directory_name, words_type, cloud.full_width_community_id_list
Code example #12
    cycle_list = ['01_07', '08_14', '15_21', '22_28', '29_30']

    community_file_path = '/pegasus/harir/yangjinfeng/commitresult4/community2/inoutOrder/'
    overlap_user_file_path = '/pegasus/harir/yangjinfeng/commitresult4/community2/'
    network_edge_file_path = '/pegasus/harir/yangjinfeng/commitresult4/network/'
    path_community_node_edge_save_to = '/pegasus/harir/Qianlong/data/April/network/node_edge/'
    id_label_file = network_edge_file_path + 'kloutScore_iDname.txt'
    verified_user_file = network_edge_file_path + 'kloutScore_iDname.txt'

    community_size = 2000
    community_number = 8
    number_of_top_users = 1000
    label_users_number = 20

    for cycle in cycle_list:
        community_user_ordered_file = get_dirlist(
            path=community_file_path, key_word_list=[cycle, 'icpm_ordered'])[0]
        edge_file = get_dirlist(path=network_edge_file_path,
                                key_word_list=[cycle, '-network_weighted'])[0]
        overlap_user_file = get_dirlist(
            path=overlap_user_file_path,
            key_word_list=[cycle, 'icpm.overlap.txt'])[0]
        print community_user_ordered_file
        print edge_file
        print overlap_user_file
        time.sleep(20)

        print community_user_ordered_file + 'is being processing.'
        print '*' * 100
        save_node_file_name = community_user_ordered_file.replace(
            '.icpm_ordered', '') + '_nodes_top_' + str(
                number_of_top_users) + '_contain_verified' + '.csv'
Code example #13
def update_tweet(file_path, tweet_dataFrame_dict):
    """
    update the info of each tweet in the dataFrame accroedig to other tweets.
    :param file_path: The path of directory that all files in.
    :param tweet_dataFrame_list: the list containing the tweet-dataFrame
    :param tweet_dataFrame_index_list: the list containing the index of each dataFrame in tweet_dataFrame_list.
    :return: updated tweet_dataFrame_list.
    """
    tweet_dataFrame_index_dict = get_tweet_dataFrame_index_dict(
        tweet_dataFrame_dict)
    file_name_list = get_dirlist(file_path)
    file_index = 0
    for file_name in file_name_list:
        file_index = file_index + 1
        print file_index, 'UPDATING INFO OF TWEET...', file_name, 'is processing......'
        write_log(log_file_name='calculate_lifecycle_own_apart.log',
                  log_file_path=os.getcwd(),
                  information=str(file_index) + ': UPDATING INFO OF TWEET...' +
                  str(file_name) + 'is being processed......')
        data_file = open(file_path + file_name, 'r')
        index = 0
        for line in data_file:
            index += 1
            row = json.loads(line)
            tweet_body = row['tweet']['body']

            # 'reply' type: update the info of the tweet that this reply replies to.
            if row['type'] == 'reply':
                tweet_id = "00" + row['tweet']['inReplyTo']
                tweet_index = whether_in_dict(
                    str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                if tweet_index != None:
                    temp_time = compare_time(
                        origin_time=tweet_dataFrame_dict[tweet_index].
                        end_time[tweet_id],
                        new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[tweet_index].loc[
                        [tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[tweet_index].loc[[tweet_id],
                                                          ['reply_count']] += 1
                    # print index, 'PROCESSING TWEET... tweet type:', row[ 'type'], 'inReplyTo in the dataFrame and update "reply_count and end_time', '00' + row['tweet']['inReplyTo']
                else:
                    pass

            # 'tweet' type.
            # Case 1: the user retweeted someone's tweet and attached his own words: update the info of the retweeted tweet if it is in the dataFrame.
            # Case 2: the user posted a new tweet containing only his own original content: do nothing.
            elif row[
                    'type'] == 'tweet' and '://twitter.com/' in tweet_body and '/status/' in tweet_body:
                tweet_body_content_list = tweet_body.split('://twitter.com/')
                tweet_id_content = [
                    content.split('/status/')[1]
                    for content in tweet_body_content_list
                    if '/status/' in content
                ][0]
                tweet_id = '00' + tweet_id_content[:18]
                tweet_index = whether_in_dict(
                    str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                if tweet_index != None:
                    temp_time = compare_time(
                        origin_time=tweet_dataFrame_dict[tweet_index].
                        end_time[tweet_id],
                        new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[tweet_index].loc[
                        [tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[tweet_index].loc[
                        [tweet_id], ['retweet_count']] += 1
                    # print index, 'PROCESSING TWEET... tweet type:', row['type'], 'update "end_time and retweet_count" of tweet:', tweet_id
                else:
                    # print index , 'PROCESSING TWEET... tweet type:', row['type'], 'tweet:', tweet_id,'not in the dataFrame'
                    pass
            # 'retweet' type
            elif row['type'] == 'retweet':
                origin_tweet_id = row['originTweet']['id']
                origin_tweet_index = whether_in_dict(
                    str=origin_tweet_id, dictionary=tweet_dataFrame_index_dict)
                if origin_tweet_index != None:
                    temp_time = compare_time(
                        origin_time=tweet_dataFrame_dict[origin_tweet_index].
                        end_time[origin_tweet_id],
                        new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[origin_tweet_index].loc[
                        [origin_tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[origin_tweet_index].loc[
                        [origin_tweet_id], ['retweet_count']] += 1
                    # print index, 'PROCESSING TWEET... tweet type:', row['type'], 'originweet in the dataFrame and update "end_time and retweet_count" of tweet:', tweet_id
                else:
                    # print index , 'PROCESSING TWEET... tweet type:', row['type'], 'originTweet not in the dataFrame'
                    pass
                if '://twitter.com/' in tweet_body and '/status/' in tweet_body:
                    tweet_body_content_list = tweet_body.split(
                        '://twitter.com/')
                    tweet_id_content = [
                        content.split('/status/')[1]
                        for content in tweet_body_content_list
                        if '/status/' in content
                    ][0]
                    tweet_id = '00' + tweet_id_content[:18]
                    tweet_index = whether_in_dict(
                        str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                    if tweet_index != None:
                        temp_time = compare_time(
                            origin_time=tweet_dataFrame_dict[tweet_index].
                            end_time[tweet_id],
                            new_time=row['tweet']['postedTime'])
                        tweet_dataFrame_dict[tweet_index].loc[
                            [tweet_id], ['end_time']] = temp_time
                        tweet_dataFrame_dict[tweet_index].loc[
                            [tweet_id], ['retweet_count']] += 1
                        # print index, 'PROCESSING TWEET... tweet type:', row['type'], 'body has twitter url, and updata "end_time and retweet_count" of tweet:', tweet_id
                    else:
                        # print index,  'PROCESSING TWEET... tweet type:', row['type'], 'body has twitter url, but not in the dataFrmae '
                        pass

        data_file.close()
    return tweet_dataFrame_dict
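update_tweet relies on two helpers that are not shown: whether_in_dict, which returns the key of the dataFrame whose tweet-id list contains a given id (or None), and compare_time, which keeps the later of the stored end time and a newly observed posting time. Minimal sketches under those assumptions:

def whether_in_dict(str, dictionary):
    # Hypothetical helper: return the key whose id list contains `str`, else None.
    # (The parameter name `str` shadows the built-in only to mirror the calls above.)
    for key, id_list in dictionary.items():
        if str in id_list:
            return key
    return None

def compare_time(origin_time, new_time):
    # Hypothetical helper: keep the later of the two times; ISO-8601 strings are
    # assumed, which compare correctly as plain strings.
    return new_time if new_time > origin_time else origin_time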
Code example #14
def main():
    if len(sys.argv) != 3:
        print "Unknown Option \n usage: python %s file.scel new.txt" % (sys.argv[0])
        exit(1)

    # If the scel path parameter is a directory, you can place many .scel files in it and this process will combine the results into one txt file.
    if os.path.isdir(sys.argv[1]):
        for fileName in glob.glob(sys.argv[1] + '*.scel'):
            print fileName
            generator = get_word_from_sogou_cell_dict(fileName)
            with open(sys.argv[2], "a") as f:
                store(generator, f)

    else:
        generator = get_word_from_sogou_cell_dict(sys.argv[1])
        with open(sys.argv[2], "w") as f:
            store(generator, f)
            # showtxt(generator)


if __name__ == "__main__":
    # main()
    path = './../resource/dict_scel/'
    file_name_list = get_dirlist(path=path, key_word_list=['.scel'])
    print file_name_list
    for file in file_name_list:
        print type(file)
        print file
        generator = get_word_from_sogou_cell_dict(path + file)
        with open('./../resource/dict_txt/' + file.replace('.scel','.txt'), "w") as f:
            store(generator, f)
Code example #15
    # label_users_number = 20
    # save_node_file_name = 'community_nodes.csv'
    # save_edge_file_name = 'community_edges.csv'
    #
    #
    # community_network = CommunityNetwork(community_size=community_size,community_number=community_number)
    # community_network.get_community_nodes(user_community_path_file=userId_communityId_file)
    # community_network.get_community_top_nodes(number_of_top_users=number_of_top_users,community_user_ordered_path_file=community_user_ordered_file,filter_verified_user=True,verified_user_path_file=verified_user_file)
    # community_network.get_community_edges(total_edge_weight_path_file=total_edge_file,sep = ',',wether_hash=False)
    # # community_network.filter_verified_user(verified_user_path_file= verified_user_file)
    # community_network.label_nodes(top_node_size=label_users_number,label_path_file= id_label_file)
    # community_network.community_nodes_dataFrame.to_csv(path_community_node_edge_save_to + save_node_file_name,index = False, header = True, columns = ['id','community_id','label'])
    # community_network.community_edges_dataFrame.to_csv(path_community_node_edge_save_to + save_edge_file_name, index = False, header= True, columns= ['source','target','weight'])

    community_file_path = '/pegasus/harir/yangjinfeng/commitresult/community2/'
    community_user_ordered_file_list = get_dirlist(
        path=community_file_path, key_word_list=['icpm_ordered'])
    print len(community_user_ordered_file_list)
    print community_user_ordered_file_list
    time.sleep(20)
    path_community_node_edge_save_to = '/pegasus/harir/Qianlong/data/network/node_edge/'
    qianlong_network_path = '/pegasus/harir/Qianlong/data/network/'
    community_size = 2000
    community_number = 8
    number_of_top_users = 1000
    label_users_number = 20
    id_label_file = qianlong_network_path + 'user_all_yang.csv'
    verified_user_file = qianlong_network_path + 'user_verified_long.csv'
    total_edge_file = '/pegasus/harir/sunweiwei/weight/total/' + 'total_network_weight'

    for community_user_ordered_file in community_user_ordered_file_list:
        print community_user_ordered_file + 'is being processing.'
Code example #16
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
from utility.functions import get_dirlist
import pandas as pd
import time
import numpy as np

file_path = 'D:/node_edge/'
file_name_list = get_dirlist(path=file_path,
                             key_word_list=['nodes', '2016-'],
                             no_key_word_list=['total'])
print len(file_name_list)
print file_name_list
time.sleep(10)
node_dataFrame = pd.DataFrame()
node_dataFrame['id'] = None
node_dataFrame['label'] = None
node_dataFrame['2016-03-23'] = None
node_dataFrame['2016-03-24'] = None
node_dataFrame['2016-03-25'] = None
node_dataFrame['2016-03-26'] = None
node_dataFrame['2016-03-27'] = None
node_dataFrame['2016-03-28'] = None
node_dataFrame['2016-03-29'] = None
node_dataFrame['2016-03-30'] = None
node_dataFrame['2016-03-31'] = None

day_list = [
    '2016-03-23', '2016-03-24', '2016-03-25', '2016-03-26', '2016-03-27',
    '2016-03-28', '2016-03-29', '2016-03-30', '2016-03-31'
Code example #17
                    return True
            else:
                return True
        else:
            return True


if __name__ == '__main__':
    origin_data_path = '/pegasus/twitter-p-or-t-uae-201603.json.dxb/'
    save_path = '/pegasus/harir/Qianlong/data/March/'
    # origin_data_path = 'F:/Twitter/April/'
    # save_path = 'F:/Twitter/April/'

    # file_name_list = get_dirlist(origin_data_path,key_word_list=['201604','.json'])
    file_name_list = get_dirlist(
        origin_data_path,
        key_word_list=['f424-4f7c-b21c-33b34d491577', '.json'],
        no_key_word_list=['.crc'])
    # file_name_list = [ 'twitter-p-or-t-uae-201604.json']
    print len(file_name_list)
    print file_name_list
    time.sleep(10)
    total_number = 0
    no_dubai_number = 0
    dubai_strict_number = 0
    dubai_no_strict_number = 0

    for file_name in file_name_list:
        no_dubai_file = 'no_dubai_' + file_name
        dubai_strict_file = 'dubai_strict_' + file_name
        dubai_no_strict_file = 'dubai_no_strict_' + file_name
        filter_data = FilterData(origin_data_file=origin_data_path + file_name)