コード例 #1
0
def main():
    """Gather not-yet-recorded image names, then run ``poolDetect``.

    Reads the ``tile`` value of every document in the collection returned
    by ``connect``, parses each ``../output/*.txt`` file as JSON, and keeps
    the ``name_file`` of every record whose ``tile`` is not among them.
    """
    # The first returned value (the database handle) is not used here.
    _database, collection = connect("datamad1019", "already_analyzed")
    metadata_paths = glob.glob("../output/*.txt")

    # Project only the "tile" field; "_id" is explicitly excluded.
    known_tiles = [
        doc["tile"] for doc in collection.find({}, {"tile": 1, "_id": 0})
    ]

    pending_images = []
    for path in metadata_paths:
        with open(path) as handle:
            record = json.load(handle)
        if record["tile"] in known_tiles:
            continue
        pending_images.append(record["name_file"])

    # NOTE(review): pending_images is built but never handed to
    # poolDetect() -- confirm whether it should be passed as an argument.
    poolDetect()
コード例 #2
0
def process_cursor(skip_n, limit_n):
    """Run the analysis on the reviews of one slice of ``cIds_list``.

    Queries the collection for every document whose ``cId`` is contained
    in ``cIds_list[skip_n:limit_n]``, gathers the ``cId`` and ``desc``
    fields of those documents into a DataFrame and passes it to
    ``ma.analysis``.

    Args:
        skip_n: Start index (inclusive) into the module-level ``cIds_list``.
        limit_n: End index (exclusive) into ``cIds_list``; it is used as a
            slice bound, not as a count.
    """
    proc = os.getpid()
    print('Starting num: {0}, limit num: {1} by process id: {2}'.format(
        skip_n, limit_n, proc))

    collection = cm.connect()  # connect to mongodb

    # Each call only touches the cIds that fall inside its own slice.
    cursor = collection.find({u'cId': {'$in': cIds_list[skip_n:limit_n]}})

    # Collect the rows first and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    # An empty cursor yields an empty frame that still has both columns.
    rows = [(doc['cId'], doc['desc']) for doc in cursor]
    reviews = pd.DataFrame(rows, columns=['cId', 'desc'])

    ma.analysis(reviews)

    print('Completed num: {0}, process: {1}'.format(skip_n, proc))
コード例 #3
0
        reviews = reviews.append(tmp, ignore_index=True)
        #print "cId: " + doc['cId'] + " desc: " + doc['desc']


    ma.analysis(reviews)
    
    print('Completed num: {0}, process: {1}'.format(skip_n, proc))




if __name__ == '__main__':
    print("start " + ts.timestamp())

    #connect to mongodb for creating cIds_list
    collection = cm.connect()


    #sorting by cIds except naverpay, count1
    pipeline = [{'$group':{'_id':'$cId', 'count':{'$sum':1}}}, {'$sort':{    'count':-1}}]
    cIds = pd.DataFrame(list(collection.aggregate(pipeline)))
    cIds = cIds.drop(cIds.index[0]) #drop naverpay
    cIds = cIds[cIds['count'] != 1] #drop count1
    cIds = cIds.sort_values(by=['_id'])
    cIds_list = cIds['_id'].tolist()
    del cIds    
    
    print ("mongodb access for preparing" + ts.timestamp())
    
    
    n_cores = 32