Example 1
def update_user_id():
    # Read the uid -> integer id mapping produced for the weibo graph.
    GRAPH_DATA_DIR = '/mnt/data1/weibo_graph/'
    id_map_file = open(GRAPH_DATA_DIR + 'id_map.txt')
    uids = dict()
    total_count = 107628903
    finish_count = 0
    #bar=get_progressive_bar(total_count=total_count)
    for line in id_map_file:
        line = line.replace('\n', '').split(' ')
        uids[line[0]] = line[1]
        finish_count += 1
        #bar.cursor.restore()
        #bar.draw(value=finish_count)
    id_map_file.close()
    # Stamp the integer id onto every user document that has a mapping.
    from pymongo import Connection
    users = Connection().user_profilling.users
    finish_count = 0
    u = set()
    for user in users.find({}, {'uid': True}):
        finish_count += 1
        uid = user['uid']
        u.add(uid)
        try:
            int_id = uids[uid]
        except KeyError:
            continue
        users.update({'_id': user['_id']}, {'$set': {'int_id': int_id}})
        #bar.cursor.restore()
        #bar.draw(value=finish_count)
    # Report how many uids in Mongo also appear in the graph mapping.
    uids = set(uids.keys())
    together = uids & u
    print len(together)
    print len(uids)
    print len(u)
Example 2
def age_distribute():
    from small_utils.progress_bar import progress_bar
    from pymongo import Connection
    from collections import Counter
    collection = Connection().jd.test_users
    weibo_collection = Connection().jd.weibo_users
    # Map jd user ids to their linked weibo ids.
    linked_jd_ids = dict()
    ages = []
    for line in open('/mnt/data1/adoni/data/linked_uids.data'):
        parts = line.strip().split(' ')
        linked_jd_ids[parts[1]] = parts[0]
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if sum(user['profile']['age']) == 0:
            continue
        # Skip users without a linked weibo account (the original would
        # raise a KeyError here).
        if user['_id'] not in linked_jd_ids:
            continue
        weibo_user = weibo_collection.find_one({'_id': linked_jd_ids[user['_id']]})
        if weibo_user is None:
            continue
        # Birthdays look like u'1987年...': the year precedes the 年 marker.
        age = 2015 - int(weibo_user['birthday'].split(u'年')[0])
        if age > 50 or age < 10:
            continue
        ages.append(age)
        # Two-way indicator: under 30 vs. 30 and over.
        if age < 30:
            user['profile']['age'] = [1, 0]
        else:
            user['profile']['age'] = [0, 1]
        collection.update({'_id': user['_id']}, {'$set': {'profile': user['profile']}})
        bar.draw(index)
    # Print the cumulative age distribution.
    s = sum(Counter(ages).values())
    ages = sorted(Counter(ages).items(), key=lambda d: d[0])
    ss = 0.
    for age in ages:
        ss += age[1]
        print age[0], ss / s
Example 3
def update_all():
    from pymongo import Connection
    tags = Connection()["reddit"]["tags"]
    index = Connection()["reddit"]["inverted_index"]

    # Characters MongoDB does not allow in key names.
    invalid = ['.', '$']
    for tag in tags.find():
        for key in tag.keys():
            if key != "_id":
                word_list = tag[key]
                for w in word_list:
                    for i in invalid:
                        if i in w:
                            w = w.replace(i, '')
                    row = index.find_one({"key": w})
                    if not row:
                        index.insert({"key": w, "ids": [key]})
                    else:
                        print "Updating", w
                        lst = list(row["ids"])
                        # Avoid storing the same id twice (cf. update_index
                        # below, which guards the same way).
                        if key not in lst:
                            lst.append(key)
                        index.update({"key": w}, {"key": w, "ids": lst})
Example 4
def insert_LINE_vector(file_name=RAW_DATA_DIR + 'normalize2.data'):
    # RAW_DATA_DIR, progress_bar and Connection come from the enclosing
    # module (see age_distribute above for the matching imports).
    vectors = dict()
    fin = open(file_name)
    # Header line of the LINE embedding file: "<count> <dimension>".
    line = fin.readline().strip().split(' ')
    count, dimension = int(line[0]), int(line[1])
    bar = progress_bar(count)
    for index in xrange(count):
        line = fin.readline().strip().split(' ')
        vectors[line[0]] = map(float, line[1:])
        bar.draw(index + 1)
    fin.close()
    # Training users without a vector get a zero vector in the dict (but
    # no update), so matching test users below are not skipped.
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            vectors[user['_id']] = [0.] * dimension
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
    collection = Connection().jd.test_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
Example 5
import random

from pymongo import Connection


class Food(object):

    def __init__(self):
        self.db = Connection()["food"]["choices"]

    def add(self, name):
        # Upsert keeps each food name unique in the collection.
        name = str(name).lower()
        self.db.update({'name': name}, {'name': name}, upsert=True)

    def remove(self, name):
        self.db.remove({'name': name})

    def get_all(self):
        try:
            return [i['name'] for i in self.db.find()]
        except Exception:
            return []

    def choose(self):
        try:
            return random.choice([i['name'] for i in self.db.find()])
        except Exception:
            return "unknown"
Example 6
def f(level_min, level_max):
    # address, user, password, selmer2 and max_time are supplied by the
    # enclosing scope; the original signature named the bounds l_min/l_max
    # but never used them.
    from pymongo import Connection
    C = Connection(address).research
    C.authenticate(user, password)
    C = C.ellcurves
    for v in C.find({'level': {'$gte': level_min, '$lt': level_max},
                     'sel2': {'$exists': False}}):
        sel2 = selmer2(eval(v['weq']), max_time)
        C.update({'_id': v['_id']}, {'$set': {'sel2': sel2}})
Example 7
from pymongo import Connection


class CountryDB(object):
    def __init__(self, mongo_host):
        self._db = Connection(mongo_host)['crunch']['country']

    def save(self, country):
        self._db.save({'_id': country})

    def increment(self, country):
        # Upsert creates the counter document the first time a country
        # is seen, then $inc bumps it.
        self._db.update({'_id': country}, {'$inc': {'c': 1}}, upsert=True)
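
A brief usage sketch, assuming a reachable mongod (the hostname and country code are illustrative):

db = CountryDB('localhost')
db.increment('IT')    # upsert creates {'_id': 'IT', 'c': 1}
db.increment('IT')    # a second call leaves {'_id': 'IT', 'c': 2}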
Example 8
File: ap.py Project: merbst/psage
def f(level_min, level_max):
    # address, user, password, pari and the prime list P come from the
    # enclosing Sage session; the original signature named the bounds
    # l_min/l_max but never used them.
    from pymongo import Connection
    C = Connection(address).research
    C.authenticate(user, password)
    C = C.ellcurves
    for v in C.find({'level': {'$gte': level_min, '$lt': level_max},
                     'number': 1,
                     'ap': {'$exists': False}}):
        E = pari('ellinit(%s,1)' % v['weq'])
        ap = dict([(str(p), int(E.ellap(p))) for p in P])
        C.update({'_id': v['_id']}, {'$set': {'ap': ap}})
Example 9
def f(level_min, level_max):
    # address, user, password, num_zeros and the psage/Sage helpers are
    # supplied by the enclosing scope; the original signature named the
    # bounds l_min/l_max but never used them.
    from pymongo import Connection
    C = Connection(address).research
    C.authenticate(user, password)
    C = C.ellcurves
    for v in C.find({'level': {'$gte': level_min, '$lt': level_max},
                     'number': 1,
                     'L0s': {'$exists': False}}):
        L = Lfunction_from_elliptic_curve(EllipticCurve(eval(v['weq'])), 10**5)
        z = L.find_zeros_via_N(num_zeros)
        L0s = dict([(str(i), float(z[i])) for i in range(len(z))])
        C.update({'_id': v['_id']}, {'$set': {'L0s': L0s}})
Example 10
def insert_age_vector():
    from collections import Counter
    from pymongo import Connection
    users = Connection().jd.weibo_users
    all_vec = []
    for user in users.find():
        profile = user['profile']
        # Default to the "unknown" vector; overwrite it only when a
        # usable birth year can be parsed.
        age_vec = [0, 0]
        birthday = user['birthday']
        if birthday is not None and u'年' in birthday:
            year = birthday[0:birthday.find(u'年')]
            if len(year) < 4:
                # Two-digit years such as u'87' are taken as 19xx.
                year = '19' + year
            year = int(year)
            if 1950 <= year <= 2010:
                # Indicator: born before 1987 vs. 1987 and later.
                age_vec = [1, 0] if year < 1987 else [0, 1]
                all_vec.append(str(age_vec))
        profile['age'] = age_vec
        users.update({'_id': user['_id']}, {'$set': {'profile': profile}})
    print Counter(all_vec)
Example 11
def update_index(id, words):
    from pymongo import Connection
    index = Connection()["reddit"]["inverted_index"]
    # invalid is a free variable in the original snippet; the list here
    # mirrors update_all above.
    invalid = ['.', '$']
    for w in words:
        for i in invalid:
            if i in w:
                w = w.replace(i, '')
        row = index.find_one({"key": w})
        if not row:
            index.insert({"key": w, "ids": [id]})
        else:
            lst = list(row["ids"])
            if id not in lst:
                lst.append(id)
                index.update({"key": w}, {"key": w, "ids": lst})
Example 12
def init():
    from pymongo import Connection
    collection = Connection().jd.test_users
    for user in collection.find():
        # Drop both cached knn fields in a single update.
        collection.update(
            {'_id': user['_id']},
            {'$unset': {'knn_by_mentions': "", 'knn_by_products': ""}})
Example 13
from pymongo import Connection


def __updateOpenvas(openvas, fieldsToUpdate):
    openvasTask = Connection().phoenorama.openvasTask
    openvasTask.update({'task_uuid': openvas.task_uuid}, {'$set': fieldsToUpdate})
    return "OpenVAS was successfully updated"
Example 14
from pymongo import Connection


class Mongo(object):
    def __init__(self, log, sw=None):
        # Loader comes from the surrounding project.
        self.name = self.__class__.__name__
        self.log = log
        self.articles = Connection().aivb_db.articles \
            if not sw else Connection().aivb_redux.dater

    def __str__(self):
        return """
                'all':        None,
                'search':     {k: v},
                'empty':      {k: 0},
                'filled':     {k: {'$gt': 0.5}},
                'gtv':        {k: {'$gt': v}},
                'regex':      {k: {'$regex': v}},
                'exists':     {k: {'$exists': True}},
                'and_ex':     {'$and': [{k: v}, {k2: {'$exists': True}}]},
                'grt_ex':     {'$and': [{k: {'$exists': True}}, {k2: {'$gt': v2}}]},
                'grt_eq':     {'$and': [{k: {'$exists': True}}, {k2: v2}]},
                'p_range':    {'$and': [{k: {'$gte': v}}, {k2: {'$lte': v2}}]},
                'period':     {'$and': [{k: v}, {k2: {'$gt': v2}}]},
                'andand':     {'$and': [{k: v}, {k2: v2}]}
                """

    def load(self, n=None):
        load = Loader(self.log)
        data = load.fetch_data(n)
        # The original used a nested list comprehension purely for its
        # side effects; plain loops say what is meant.
        for batch in data:
            for article in batch:
                self.articles.insert(article)
        self.log.mlog.info("Inserted %d Instances of articles." % n)

    def search(self,
               command,
               key=None,
               value=None,
               s_key=None,
               s_value=None,
               t_key=None):
        if not key:
            res = [self.articles.find_one()]
        else:
            res = self.parse_search(command, key, value, s_key, s_value, t_key)
        return res

    def clear_all(self, v=None):
        for art in self.articles.find():
            if v:
                print art
            self.articles.remove(art)

    def parse_search(self, c, k, v, k2, v2, k3):
        op = {
            'all':     None,
            'search':  {k: v},
            'empty':   {k: 0},
            'filled':  {k: {'$gt': 0.5}},
            'gtv':     {k: {'$gt': v}},
            'regex':   {k: {'$regex': v}},
            'exists':  {k: {'$exists': True}},
            'and_ex':  {'$and': [{k: v}, {k2: {'$exists': True}}]},
            'grt_ex':  {'$and': [{k: {'$exists': True}}, {k2: {'$gt': v2}}]},
            'grt_eq':  {'$and': [{k: {'$exists': True}}, {k2: v2}]},
            'p_range': {'$and': [{k: {'$gte': v}}, {k2: {'$lte': v2}}]},
            'period':  {'$and': [{k: v}, {k2: {'$gt': v2}}]},
            'andand':  {'$and': [{k: v}, {k2: v2}]}
        }
        if 'select' not in c:
            return self.articles.find(op[c])
        elif not k3:
            return self.articles.find(op[c.split('_')[1]], {'_id': k2, v2: 1})
        else:
            return self.articles.find(op[c.split('_')[1]],
                                      {'_id': k2, v2: 1, k3: 1})

    def update(self, c, eid, k, v, k2=None):
        # The original 'two' template nested a $set inside a $set, which
        # is not valid MongoDB update syntax; dot notation reaches the
        # embedded field instead.
        op = {'one': {'$set': {k: v}},
              'two': {'$set': {'%s.%s' % (k2, k): v}}}
        self.articles.update({'_id': eid}, op[c], upsert=False, multi=False)
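
A short usage sketch for the search helper, assuming a running mongod (log stands in for the project's logger and is hypothetical here):

m = Mongo(log)
docs = m.search('exists', key='title')        # {'title': {'$exists': True}}
for doc in docs:
    m.update('one', doc['_id'], 'seen', True) # {'$set': {'seen': True}}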
Example 15
from pymongo import Connection
from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
from nltk.metrics import edit_distance


class Linkage(object):
    def __init__(self):
        # Accept a candidate when its edit distance is at most 20% of
        # the length of the matched name.
        self.edit_dist_threshold = 0.2
        self.tweets = Connection().tweetsDB.tweetsCollection
        self.businesses = Connection().yelpDB.yelpBusinesses
        self.photos = Connection().flickrDB.flickrCollection
        self.linked = Connection().linkedDB.linkedCollection
        # Sentinel larger than any realistic edit distance.
        self.MAX_EDIT_DISTANCE = 100000000
        
    def unescape_html_chars(self, item):
        return item.replace("&amp;", "&").replace("&gt;", ">").replace("&lt;", "<").replace("&quot;", "\"")
    
    def extract_tweet_ne(self, item):
        item = item.split()
        # Split the annotated text on whitespace, collect B/I items and join them together
        # Then return a list of named entities
        result = []
        cur_ne = []
        #print item
        for token in item:
            bio_tag = token.split("/")[-1]
            #print bio_tag
            # If we're starting/in the middle of an NE
            if bio_tag == "B-ENTITY" or bio_tag == "I-ENTITY":
                cur_ne.append(self.unescape_html_chars(token.split("/")[0]))
            # If we've hit an O and we have a previous NE, close it off 
            elif bio_tag == "O" and len(cur_ne) > 0:
                result.append(" ".join(cur_ne))
                cur_ne = []
        # If we've hit the end and ended with an NE
        if len(cur_ne) > 0:
            result.append(" ".join(cur_ne))
        return result
    
    def get_nearby_businesses(self, loc):
        return [_ for _ in self.businesses.find({"loc":{"$maxDistance":2, "$near":loc}}).limit(20)]
    
    def get_nearby_photos(self, loc):
        return [_ for _ in self.photos.find({"loc":{"$maxDistance":2, "$near":loc}}).limit(20)]
    
    def get_nearby_tweets(self, loc):
        return [_ for _ in self.tweets.find({"loc":{"$maxDistance":2, "$near":loc}}).limit(100)]
    
    def extract_normal_ne(self, text):
        result = []
        for sent in sent_tokenize(text) if text else []:
            for chunk in ne_chunk(pos_tag(word_tokenize(sent))):
                if hasattr(chunk, "node"):
                    result.append(" ".join([c[0] for c in chunk.leaves()]))
        return result
    
    def yelp_twitter_linkage(self):
        # Record the number of matches just for information
        num_matches = 0
        for tweet in self.tweets.find():
            # One tweet had to be removed because it contained characters that
            # screwed up the NE and thus lacks the annotated field
            if "annotated" not in tweet:
                continue
            # Extract the named entities for comparison
            named_entities = self.extract_tweet_ne(tweet["annotated"].encode("UTF-8"))
            # Skip if we have no named entities in the tweet
            if not named_entities:
                continue
            #print "Tweet loc: %s" % tweet["loc"]
            #print "NE: %s" % named_entities
            # Examine the 20 closest businesses
            nearby_businesses = self.get_nearby_businesses(tweet["loc"])
            # Skip if there are no nearby businesses
            if not nearby_businesses:
                continue
            # For each NE in the tweet, compare to the 20 closest businesses
            for named_entity in named_entities:
                # Mark the smallest edit distance seen
                min_dist = self.MAX_EDIT_DISTANCE
                best_match = None
                for business in nearby_businesses:
                    # Get the edit distance between the Twitter NE and the business name
                    edit_dist = edit_distance(named_entity, business["name"])
                    # If it's the smallest seen so far, mark it
                    if edit_dist < min_dist:
                        min_dist = edit_dist
                        best_match = business
                # If it's below the threshold(originally 20% of the length of the best match)
                # then consider it a match
                if min_dist <= self.edit_dist_threshold * len(best_match["name"]):
                    #print "Good match for %s is: %s --> edit dist is: %d" % (named_entity, best_match, min_dist)
                    num_matches += 1
                    # Mark it in the linked db
                    self.linked.update({"id": "%s_%s" % (tweet["id"], best_match["id"])},
                                       {"tweetID":tweet["id"], "yelpID":best_match["id"]}, True)
        print "Yelp/Twitter: %d" % num_matches

    def yelp_flickr_linkage(self):
        num_matches = 0
        for item in self.photos.find():
            desc = item["description"].encode("UTF-8") if "description" in item and item["description"] else None
            title = item["title"].encode("UTF-8") if "title" in item and item["title"] else None
            if not desc and not title:
                continue
            named_entities = []
            named_entities.extend(self.extract_normal_ne(desc))
            named_entities.extend(self.extract_normal_ne(title))
            #print "DESC: %s" % desc
            #print "TITLE: %s" % title
            #print "NE: %s" % ne_list
            nearby_businesses = self.get_nearby_businesses(item["loc"])
            if not nearby_businesses:
                continue
            # Similar to the function above, iterate through the named entities in the photos
            # and compare to the Yelp business name. The best one, if within an edit distance
            # limit, is considered a match.
            for named_entity in named_entities:
                #print "NE ITEM: %s" % ne
                #print "NEARBY BUSINESSES: %s" % nearby_businesses
                min_dist = self.MAX_EDIT_DISTANCE
                best_match = None
                for business in nearby_businesses:
                    edit_dist = edit_distance(named_entity, business["name"])
                    if edit_dist < min_dist:
                        min_dist = edit_dist
                        best_match = business
                if min_dist <= self.edit_dist_threshold * len(best_match["name"]):
                    num_matches += 1
                    #print "Best match for %s is: %s" % (named_entity, best_match["name"])
                    self.linked.update({"id":"%s_%s" % (item["id"], best_match["id"])},
                                       {"flickrID":item["id"], "yelpID":best_match["id"]}, True)
        
        print "Yelp/Flickr matches: %d" % num_matches
        
    def flickr_twitter_linkage(self):
        num_matches = 0
            
        for tweet in self.tweets.find():
            if "annotated" not in tweet:
                continue
            
            named_entities = self.extract_tweet_ne(tweet["annotated"].encode("UTF-8"))
            if not named_entities:
                continue
            
            nearby_photos = self.get_nearby_photos(tweet["loc"])
            if not nearby_photos:
                continue
            
            for named_entity in named_entities:
                min_dist = self.MAX_EDIT_DISTANCE
                best_match = None
                best_match_ne = None

                for photo in nearby_photos:
                    photo_named_entities = []
                    desc = photo["description"].encode("UTF-8") if "description" in photo and photo["description"] else None
                    title = photo["title"].encode("UTF-8") if "title" in photo and photo["title"] else None
                    if not desc and not title:
                        continue
                    photo_named_entities.extend(self.extract_normal_ne(desc))
                    photo_named_entities.extend(self.extract_normal_ne(title))

                    for photo_named_entity in photo_named_entities:
                        edit_dist = edit_distance(named_entity, photo_named_entity)
                        if edit_dist < min_dist:
                            min_dist = edit_dist
                            best_match = photo
                            best_match_ne = photo_named_entity

                # The original scaled the threshold by len(best_match),
                # i.e. the number of keys in the photo document; it should
                # scale with the matched entity string.
                if best_match is not None and min_dist <= self.edit_dist_threshold * len(best_match_ne):
                    num_matches += 1
                    #print "MATCH for tweet NE %s: flickr %s" % (named_entity, best_match)
                    self.linked.update({"id": "%s_%s" % (tweet["id"], best_match["id"])},
                                       {"tweetID": tweet["id"], "flickrID": best_match["id"]}, True)
                        
        print "Flickr/tweet matches: %s" % num_matches
Example 16
import os
import re
import sys

from pymongo import Connection


class AutoCompModule:

    # Auto-completion module backed by a MongoDB server.
    # Holds three collections:
    #   dict    - how many times x appears in the learned text
    #   dictBy2 - how many times the pair (x, y) appears
    #   dictBy3 - how many times the triple (x, y, z) appears
    def __init__(self, DBName):
        self.dict = Connection()[DBName]['dict']
        self.dictBy2 = Connection()[DBName]['dictBy2']
        self.dictBy3 = Connection()[DBName]['dictBy3']
    
    
    # Learn from a single file.
    # pprev, prev, word are the three most recent words (word is the
    # current one).
    def learnSingle(self, fileName):
        infile = open(fileName, encoding='utf-8')
        for line in infile:
            pprev = prev = None
            for word in line.split():
                # Punctuation resets the n-gram window.
                if re.match("[.,\"\(\);']", word):
                    pprev = prev = word = None
                    continue

                if self.dict.find_one({"word": word, "grade": {"$exists": True}}) is not None:
                    self.dict.update({"word": word}, {"$inc": {"grade": 1}})
                else:
                    self.dict.insert({"word": word, "grade": 1, "info": None})

                if prev is not None:
                    if self.dictBy2.find_one({"first": prev, "second": word, "grade": {"$exists": True}}) is not None:
                        self.dictBy2.update({"first": prev, "second": word}, {"$inc": {"grade": 1}})
                    else:
                        self.dictBy2.insert({"first": prev, "second": word, "grade": 1})
                    if pprev is not None:
                        if self.dictBy3.find_one({"first": pprev, "second": prev, "third": word, "grade": {"$exists": True}}) is not None:
                            self.dictBy3.update({"first": pprev, "second": prev, "third": word}, {"$inc": {"grade": 1}})
                        else:
                            self.dictBy3.insert({"first": pprev, "second": prev, "third": word, "grade": 1})
                    pprev = prev
                prev = word
        infile.close()


    # Learn from multiple files, applying learnSingle to each one.
    def learn(self, inputDir):
        if os.path.isdir(inputDir):
            # Listing the directory before the isdir check, as the
            # original did, would raise on a non-directory path.
            size = len(os.listdir(inputDir))
            i = 1
            for f in sorted(os.listdir(inputDir)):
                self.learnSingle(inputDir + '/' + f)
                sys.stdout.flush()
                print(str(int((i * 100) / size)) + "%", end="\r")
                i += 1
            print("SUCCESS LEARNING FINISH")
        else:
            print("ERROR!!")


    def addMalletInfoToDB(self, wtcfile, twwfile, keysfile):
        # malletGetWordsAndData is a helper from the surrounding project.
        wordDict = malletGetWordsAndData(wtcfile, twwfile, keysfile)
        for word in wordDict:
            if self.dict.find_one({"word": word, "grade": {"$exists": True}}) is not None:
                self.dict.update({"word": word}, {"$set": {"info": wordDict[word]}})


    # Suggest the next word.
    # For a given pprev and prev (see above) it finds the most likely
    # next word, once using only prev and once using both pprev and prev.
    #
    # Either value may be None: when prev (or pprev) is None, or when the
    # dictionaries hold no match for it.
    def suggest(self, pprev=None, prev=None):
        if prev is None:
            return None, None
        if pprev is None:
            a = self.dictBy2.find_one({"first": prev}, sort=[("grade", -1), ("second", 1)])
            if a is not None:
                return a["second"], None
            return None, None
        a = self.dictBy2.find_one({"first": prev}, sort=[("grade", -1), ("second", 1)])
        b = self.dictBy3.find_one({"first": pprev, "second": prev}, sort=[("grade", -1), ("third", 1)])
        if b is not None:
            return a["second"], b["third"]
        return None, None
    
    def suggest2(self, pprev=None, prev=None, x=5):
        if prev is None:
            return None, None
        # Take the x best bigram continuations via the cursor itself
        # instead of counting manually.
        lst = list(self.dictBy2.find({"first": prev})
                   .sort([('grade', -1), ('second', 1)]).limit(x))
        if not lst:
            return None, None
        res1 = [[a["grade"], a["second"]] for a in lst]
        if pprev is None:
            return res1, None
        lstBy3 = list(self.dictBy3.find({"first": pprev, "second": prev})
                      .sort([('grade', -1), ('second', 1)]).limit(x))
        # The original tested "lstBy3 is []", which is never true; an
        # empty result should fall back to the bigram suggestions alone.
        if not lstBy3:
            return res1, None
        return res1, [[a["grade"], a["third"]] for a in lstBy3]
Example 17
import os
import re

from pymongo import Connection


class AutoCompModule:

    def __init__(self, DBName):
        self.dict = Connection()[DBName]['dict']
        self.dictBy2 = Connection()[DBName]['dictBy2']
        self.dictBy3 = Connection()[DBName]['dictBy3']
    
    def learnSingle(self, fileName):
        infile = open(fileName, encoding='utf-8')
        for line in infile:
            pprev = prev = None
            for word in line.split():
                # Punctuation resets the n-gram window.
                if re.match("[.,\"\(\);']", word):
                    pprev = prev = word = None
                    continue

                if self.dict.find_one({"word": word, "amount": {"$exists": True}}) is not None:
                    self.dict.update({"word": word}, {"$inc": {"amount": 1}})
                else:
                    self.dict.insert({"word": word, "amount": 1})

                if prev is not None:
                    if self.dictBy2.find_one({"first": prev, "second": word, "grade": {"$exists": True}}) is not None:
                        self.dictBy2.update({"first": prev, "second": word}, {"$inc": {"grade": 1}})
                    else:
                        self.dictBy2.insert({"first": prev, "second": word, "grade": 1})
                    if pprev is not None:
                        if self.dictBy3.find_one({"first": pprev, "second": prev, "third": word, "grade": {"$exists": True}}) is not None:
                            self.dictBy3.update({"first": pprev, "second": prev, "third": word}, {"$inc": {"grade": 1}})
                        else:
                            self.dictBy3.insert({"first": pprev, "second": prev, "third": word, "grade": 1})
                    pprev = prev
                prev = word

        # Turn raw counts into conditional frequencies. Note that this
        # runs after every file, so earlier files are renormalized again.
        for entity in self.dictBy3.find():
            amount = self.dictBy2.find_one({"first": entity["first"], "second": entity["second"]})["grade"]
            self.dictBy3.update({"first": entity["first"], "second": entity["second"], "third": entity["third"]},
                                {"$set": {"grade": entity["grade"] / amount}})
        for entity in self.dictBy2.find():
            amount = self.dict.find_one({"word": entity["first"]})["amount"]
            self.dictBy2.update({"first": entity["first"], "second": entity["second"]},
                                {"$set": {"grade": entity["grade"] / amount}})

        infile.close()

    def learn(self,inputDir):
        if os.path.isdir(inputDir):
            for f in sorted(os.listdir(inputDir)):
                self.learnSingle(inputDir + '/' + f)
            print ("SUCCESS LEARNING")
        else:
            print ("ERROR!!")

    def suggest(self, pprev=None, prev=None):
        if prev is None:
            return None, None
        a = self.dictBy2.find_one({"first": prev}, sort=[("grade", -1), ("second", 1)])
        if pprev is None:
            return a["second"], None
        b = self.dictBy3.find_one({"first": pprev, "second": prev}, sort=[("grade", -1), ("third", 1)])
        return a["second"], b["third"]
    
    def simpleTest(self, testFile, num):
        test = open(testFile,'r',encoding='utf-8')
        numOfChecks1 = numOfChecks2 = succ1 = succ2 = 0
        i = num
        for line in test:
            pprev = prev = None
            for word in line.split():
                if re.match("[.,\"\(\);']",word):
                    pprev = prev = word = None
                    i = num
                    continue
                if i!= 0:
                    i-=1
                    pprev = prev
                    prev = word
                else:
                    a, b = self.suggest(pprev, prev)
                    # The original compared with "is", i.e. object
                    # identity; string equality is what the test means.
                    if a is not None:
                        if a == word:
                            succ1 += 1
                        numOfChecks1 += 1
                    if b is not None:
                        if b == word:
                            succ2 += 1
                        numOfChecks2 += 1
                    i = num
                    pprev = prev
                    prev = word
        test.close()
        return succ1/numOfChecks1, succ2/numOfChecks2
Example 18
from pymongo import Connection


def __updateNmap(nmap, fieldsToUpdate):
    nmapTask = Connection().phoenorama.nmapTask
    nmapTask.update({'task_uuid': nmap.task_uuid}, {'$set': fieldsToUpdate})
    return "Nmap was successfully updated"