import glob
import json


def main():
    # connect() and poolDetect() are project helpers defined elsewhere.
    database, coll = connect("datamad1019", "already_analyzed")
    metadata = glob.glob("../output/*.txt")

    # Tiles that have already been analyzed.
    tile_un = []
    for e in coll.find({}, {"tile": 1, "_id": 0}):
        tile_un.append(e["tile"])

    # Image files whose tile has not been analyzed yet.
    images = []
    for m in metadata:
        with open(m) as json_file:
            data = json.load(json_file)
        if data["tile"] not in tile_un:
            images.append(data["name_file"])

    poolDetect()
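# --- Hedged sketch (not from the original source) ----------------------------
# main() calls poolDetect() with no arguments, so the real helper must pick up
# the pending images on its own; the version below instead takes the list as a
# parameter to keep the sketch self-contained. `detect`, `image_list` and
# `n_workers` are assumptions standing in for the project's detection routine.
from multiprocessing import Pool


def detect(name_file):
    # Hypothetical stand-in for the real per-image detection step.
    print("detecting", name_file)


def poolDetect(image_list=(), n_workers=4):
    # Fan the pending images out to a pool of worker processes.
    with Pool(processes=n_workers) as pool:
        pool.map(detect, list(image_list))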
import os

import pandas as pd

# cm, ma and ts are project modules: the MongoDB connection helper, the
# analysis module and a timestamp helper, respectively.


def process_cursor(skip_n, limit_n):
    proc = os.getpid()
    print('Starting num: {0}, limit num: {1} by process id: {2}'.format(
        skip_n, limit_n, proc))

    collection = cm.connect()  # connect to MongoDB
    # Each process only touches its own slice of cIds.
    cursor = collection.find({u'cId': {'$in': cIds_list[skip_n:limit_n]}})
    # print(str(skip_n) + " mongodb read " + ts.timestamp())

    reviews = pd.DataFrame()
    for doc in cursor:
        tmp = pd.DataFrame(data=[[doc['cId'], doc['desc']]],
                           columns=['cId', 'desc'])
        # Note: DataFrame.append was removed in pandas 2.0; use pd.concat there.
        reviews = reviews.append(tmp, ignore_index=True)
        # print("cId: " + doc['cId'] + " desc: " + doc['desc'])

    ma.analysis(reviews)
    print('Completed num: {0}, process: {1}'.format(skip_n, proc))
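# --- Hedged alternative (not in the original) ---------------------------------
# DataFrame.append is gone in pandas 2.x and appending one row per document is
# slow. A sketch of the same slice built in a single DataFrame construction;
# `rows_to_reviews` is a hypothetical helper name.
def rows_to_reviews(cursor):
    rows = [(doc['cId'], doc['desc']) for doc in cursor]
    return pd.DataFrame(rows, columns=['cId', 'desc'])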
if __name__ == '__main__':
    print("start " + ts.timestamp())

    # Connect to MongoDB to build cIds_list.
    collection = cm.connect()

    # Group reviews by cId and sort by review count, descending.
    pipeline = [{'$group': {'_id': '$cId', 'count': {'$sum': 1}}},
                {'$sort': {'count': -1}}]
    cIds = pd.DataFrame(list(collection.aggregate(pipeline)))
    cIds = cIds.drop(cIds.index[0])   # drop naverpay (first row after the sort)
    cIds = cIds[cIds['count'] != 1]   # drop cIds with only one review
    cIds = cIds.sort_values(by=['_id'])
    cIds_list = cIds['_id'].tolist()
    del cIds
    print("mongodb access for preparing " + ts.timestamp())

    n_cores = 32
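    # --- Hedged sketch (not shown in the excerpt) -----------------------------
    # The excerpt stops at n_cores = 32 without the worker dispatch. One
    # plausible continuation, assuming process_cursor(skip, limit) indexes into
    # cIds_list as defined above and that the fork start method is in use (so
    # workers inherit cIds_list):
    from multiprocessing import Process

    batch = (len(cIds_list) + n_cores - 1) // n_cores  # slice size per worker
    workers = []
    for i in range(n_cores):
        skip_n = i * batch
        limit_n = min((i + 1) * batch, len(cIds_list))
        p = Process(target=process_cursor, args=(skip_n, limit_n))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()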