Ejemplo n.º 1
0
    def get_by_model(self, queryset_or_model, tags):
        """
        Build a ``QuerySet`` of instances of the given model (or of the
        model behind the given ``QuerySet``) associated with a tag or a
        list of tags.

        Dispatch depends on how many tags ``get_tag_list`` resolves:

        * none     -- an empty ``QuerySet`` from the model's default manager;
        * several  -- delegated to ``get_intersection_by_model``;
        * exactly one -- the incoming queryset is narrowed with ``extra()``
          against this manager's model table (content type, tag id and
          object id columns).
        """
        tags = get_tag_list(tags)
        how_many = len(tags)

        if how_many > 1:
            # Several tags: let the intersection query handle them.
            return self.get_intersection_by_model(queryset_or_model, tags)

        queryset, model = get_queryset_and_model(queryset_or_model)
        if how_many == 0:
            # No existing tags were given.
            return model._default_manager.none()

        # Single tag: a plain join is enough, no GROUP BY / HAVING needed.
        the_tag = tags[0]
        content_type = ContentType.objects.get_for_model(model)
        item_opts = self.model._meta
        item_table = qn(item_opts.db_table)
        model_opts = model._meta

        conditions = [
            '%s.content_type_id = %%s' % item_table,
            '%s.tag_id = %%s' % item_table,
            '%s.%s = %s.object_id' % (qn(model_opts.db_table),
                                      qn(model_opts.pk.column),
                                      item_table),
        ]
        return queryset.extra(
            tables=[item_opts.db_table],
            where=conditions,
            params=[content_type.pk, the_tag.pk],
        )
Ejemplo n.º 2
0
def user_rep():
    """
    Build a per-user tag representation from the pickled training posts.

    Every record in ``Training_Body_Title_user.p`` has its ``tags`` field
    converted with ``utils.get_tag_list`` and ``get_tag_encoding``; the
    resulting encodings are summed per user, keyed by
    ``user_id[record['OwnerUserId']]``.  Each accumulated vector is then
    replaced by ``softmax`` of itself.

    Records whose owner cannot be resolved through ``user_id`` are
    skipped.  A running record count and the shape of every final vector
    are printed as progress output.

    Returns:
        dict mapping the resolved user key to its softmax-ed vector.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only feed
    # this function dumps from a trusted source.
    with open("Training_Body_Title_user.p", "rb") as openfile:
        posts = pickle.load(openfile)

    repre = {}

    for count, post in enumerate(posts, 1):
        tag_string = post['tags'].encode('utf-8')
        tag_list = utils.get_tag_list(tag_string)
        tag_enc = get_tag_encoding(tag_list)

        print(count)

        try:
            owner = user_id[post['OwnerUserId']]
        except (LookupError, TypeError):
            # Unknown / missing owner: nothing to attribute the tags to.
            continue

        if owner not in repre:
            # Vectors start at zero with one slot per entry of tag_dict.
            repre[owner] = np.zeros(len(tag_dict))
        repre[owner] += tag_enc

    for key in repre:
        repre[key] = softmax(repre[key])
        print(repre[key].shape)

    return repre
Ejemplo n.º 3
0
    def related_for_model(self, tags, model, counts=False, min_count=None):
        """
        Obtain a list of tags related to a given list of tags - that
        is, other tags used by items which have all the given tags.

        If ``counts`` is true, a ``count`` attribute will be added to
        each tag, indicating the number of items which have it in
        addition to the given list of tags.

        If ``min_count`` is given, only tags which have a ``count``
        greater than or equal to ``min_count`` will be returned.
        Passing a value for ``min_count`` implies ``counts=True``.

        Returns a list of instances of this manager's model, built from
        the first two selected columns (id, name) and ordered by name.
        """
        from models import TaggedItem
        if min_count is not None:
            counts = True
        tags = get_tag_list(tags)
        tag_count = len(tags)
        tagged_item_table = qn(TaggedItem._meta.db_table)

        # Optional SQL fragments; both are empty when not requested.
        if counts:
            count_sql = ', COUNT(%s.object_id)' % tagged_item_table
        else:
            count_sql = ''
        if min_count is not None:
            min_count_sql = 'HAVING COUNT(%s.object_id) >= %%s' % tagged_item_table
        else:
            min_count_sql = ''

        # Table names and integer ids are interpolated here; tag ids and
        # min_count stay as %s placeholders bound by cursor.execute().
        query = """
        SELECT %(tag)s.id, %(tag)s.name%(count_sql)s
        FROM %(tagged_item)s INNER JOIN %(tag)s ON %(tagged_item)s.tag_id = %(tag)s.id
        WHERE %(tagged_item)s.content_type_id = %(content_type_id)s
          AND %(tagged_item)s.object_id IN
          (
              SELECT %(tagged_item)s.object_id
              FROM %(tagged_item)s, %(tag)s
              WHERE %(tagged_item)s.content_type_id = %(content_type_id)s
                AND %(tag)s.id = %(tagged_item)s.tag_id
                AND %(tag)s.id IN (%(tag_id_placeholders)s)
              GROUP BY %(tagged_item)s.object_id
              HAVING COUNT(%(tagged_item)s.object_id) = %(tag_count)s
          )
          AND %(tag)s.id NOT IN (%(tag_id_placeholders)s)
        GROUP BY %(tag)s.id, %(tag)s.name
        %(min_count_sql)s
        ORDER BY %(tag)s.name ASC""" % {
            'tag': qn(self.model._meta.db_table),
            'count_sql': count_sql,
            'tagged_item': tagged_item_table,
            'content_type_id': ContentType.objects.get_for_model(model).pk,
            'tag_id_placeholders': ','.join(['%s'] * tag_count),
            'tag_count': tag_count,
            'min_count_sql': min_count_sql,
        }

        # The tag-id placeholder list appears twice (IN and NOT IN).
        params = [tag.pk for tag in tags] * 2
        if min_count is not None:
            params.append(min_count)

        cursor = connection.cursor()
        cursor.execute(query, params)
        related = []
        for row in cursor.fetchall():
            tag = self.model(*row[:2])
            # Same truthiness test that decided whether the COUNT column
            # was selected, so any truthy ``counts`` gets its counts.
            if counts:
                tag.count = row[2]
            related.append(tag)
        return related
Ejemplo n.º 4
0
def train():
    """
    Fit the module-level ``model`` on the pickled training posts.

    The records in ``Training_Body_Title_user.p`` are walked five times.
    For each record the title and body are concatenated, cleaned with
    ``utils.clean_question`` and embedded with ``get_question_embedding``;
    the ``tags`` field is encoded with ``get_tag_encoding``.  Every time
    ``batch_size`` samples have been collected, ``model.fit`` is called
    for one epoch on that batch and the model and its weights are saved.

    NOTE(review): samples left over at the end of a pass (fewer than
    ``batch_size``) are discarded, because the buffers are reset at the
    start of the next pass -- confirm this is intended.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only feed
    # this function dumps from a trusted source.
    with open("Training_Body_Title_user.p", "rb") as openfile:
        posts = pickle.load(openfile)

    for loop in range(5):
        x_train = []
        user = []
        y_train = []
        for o, y in enumerate(posts):
            try:
                question = y['Title'].encode('utf-8') + ' '
            except Exception:
                # Title cannot be read or encoded: fall back to body only.
                question = ''
            question = question + y['Body'].encode('utf-8')
            question = utils.clean_question(question)

            tag_string = y['tags'].encode('utf-8')
            tag_list = utils.get_tag_list(tag_string)

            x_train.append(get_question_embedding(question))
            y_train.append(get_tag_encoding(tag_list))
            try:
                user.append(meta_model[str(user_num[y['OwnerUserId']])])
            except Exception:
                # No user vector available: use a 128-wide zero vector.
                user.append(np.zeros(128))

            cnt = len(x_train)
            if cnt != batch_size:
                continue

            x_train = np.asarray(x_train)
            y_train = np.asarray(y_train)
            # Only consumed by the two-input variant
            # ``model.fit([x_train, user], y_train, epochs=1)``, which is
            # currently disabled in favour of the content-only fit below.
            user = np.asarray(user)
            print("cnt:  %s  loop:  %s  o:  %s" % (cnt, loop, o))

            model.fit(x_train, y_train, epochs=1)
            # Checkpoint after every batch.
            model.save(
                'model4_train_DeepTagRecContent_usingAdd_sigmoid.h5')
            model.save_weights(
                'model4_train_DeepTagRecContent_weights_usingAdd_sigmoid.h5'
            )

            x_train = []
            user = []
            y_train = []
Ejemplo n.º 5
0
    def get_intersection_by_model(self, queryset_or_model, tags):
        """
        Build a ``QuerySet`` of instances of the given model (or of the
        model behind the given ``QuerySet``) that are associated with
        *every* tag in the given list.

        An empty tag list, or a query that matches no objects, yields an
        empty ``QuerySet`` from the model's default manager.
        """
        tags = get_tag_list(tags)
        tag_count = len(tags)
        queryset, model = get_queryset_and_model(queryset_or_model)

        if not tag_count:
            return model._default_manager.none()

        model_table = qn(model._meta.db_table)
        substitutions = {
            'model_pk': '%s.%s' % (model_table, qn(model._meta.pk.column)),
            'model': model_table,
            'tagged_item': qn(self.model._meta.db_table),
            'content_type_id': ContentType.objects.get_for_model(model).pk,
            'tag_id_placeholders': ','.join(['%s'] * tag_count),
            'tag_count': tag_count,
        }
        # Select the ids of all objects carrying all of the given tags:
        # one joined row per matching tag, so the group size must equal
        # the number of tags asked for.
        query = """
        SELECT %(model_pk)s
        FROM %(model)s, %(tagged_item)s
        WHERE %(tagged_item)s.content_type_id = %(content_type_id)s
          AND %(tagged_item)s.tag_id IN (%(tag_id_placeholders)s)
          AND %(model_pk)s = %(tagged_item)s.object_id
        GROUP BY %(model_pk)s
        HAVING COUNT(%(model_pk)s) = %(tag_count)s""" % substitutions

        cursor = connection.cursor()
        cursor.execute(query, [tag.pk for tag in tags])
        matching_ids = [row[0] for row in cursor.fetchall()]
        if not matching_ids:
            return model._default_manager.none()
        return queryset.filter(pk__in=matching_ids)
Ejemplo n.º 6
0
    def __init__(self, corpus):
        """
        Initialise the grammar and lexicon from ``corpus`` and derive the
        frequency tables and tag lists stored on the instance.

        Steps, in order:

        1. ``get_pcfg(corpus)`` runs after ``grammar`` and ``lexicon``
           are set to empty dicts (presumably it fills them with raw
           counts -- confirm in ``get_pcfg``).
        2. ``freq_tokens``: per-token totals over all lexicon tags,
           divided by their grand total.
        3. ``chomskyfy()`` runs with ``set_artificial_tags`` initialised
           to an empty set (presumably it records the symbols it
           introduces there -- confirm in ``chomskyfy``).
        4. ``freq_terminal_tags``: per-tag lexicon totals, divided by
           their grand total.
        5. ``grammar`` and ``lexicon`` are replaced by ``get_prob`` of
           themselves.
        6. Tag lists: ``list_tags`` (tags from ``get_tag_list(grammar)``
           minus the artificial ones), ``list_artificial_symbols`` and
           their concatenation ``list_all_tags``, plus the two sizes.
        """
        self.grammar = {}
        self.lexicon = {}

        self.get_pcfg(corpus)

        # Token frequencies, accumulated across every tag of the lexicon.
        token_freqs = self.freq_tokens = {}
        for word_counts in self.lexicon.values():
            for token, occurrences in word_counts.items():
                token_freqs[token] = token_freqs.get(token, 0) + occurrences
        token_total = np.sum(list(token_freqs.values()))
        for token in token_freqs:
            token_freqs[token] /= token_total

        self.set_artificial_tags = set()
        self.chomskyfy()

        # Terminal-tag frequencies, computed on the lexicon as it stands
        # after chomskyfy().
        tag_freqs = {}
        for tag, word_counts in self.lexicon.items():
            tag_freqs[tag] = np.sum(list(word_counts.values()))
        self.freq_terminal_tags = tag_freqs
        tag_total = np.sum(list(tag_freqs.values()))
        for tag in tag_freqs:
            tag_freqs[tag] /= tag_total

        self.grammar = get_prob(self.grammar)
        self.lexicon = get_prob(self.lexicon)

        every_tag = set(get_tag_list(self.grammar))
        self.list_artificial_symbols = list(self.set_artificial_tags)
        self.list_tags = list(every_tag.difference(self.set_artificial_tags))

        # Original tags first, artificial symbols appended after them.
        self.list_all_tags = self.list_tags + self.list_artificial_symbols
        self.nb_tags = len(self.list_tags)
        self.nb_all_tags = len(self.list_all_tags)
Ejemplo n.º 7
0
def test():
    """
    Evaluate the module-level ``model`` on the posts in ``score_low.p``.

    Each record's title and body are concatenated, cleaned with
    ``utils.clean_question`` and embedded with ``get_question_embedding``;
    its ``Tags`` field is encoded with ``get_tag_encoding``.  The model
    predicts on all embeddings at once, and ``evaluate`` is called for
    cut-offs 3, 5 and 10.  Precision and recall for each cut-off are
    printed; nothing is returned.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only feed
    # this function dumps from a trusted source.
    with open("score_low.p", "rb") as openfile:
        posts = pickle.load(openfile)

    x_test = []
    user = []
    actual = []
    for y in posts:
        question = y['Title'].encode('utf-8')
        question = question + ' ' + y['Body'].encode('utf-8')
        question = utils.clean_question(question)

        tag_string = y['Tags'].encode('utf-8')
        tag_list = utils.get_tag_list(tag_string)

        question_enc = get_question_embedding(question)
        tag_enc = get_tag_encoding(tag_list)
        try:
            user.append(meta_model[str(user_num[y['OwnerUserId']])])
        except Exception:
            # No user vector available: use a 128-wide zero vector.
            user.append(np.zeros(128))

        x_test.append(question_enc)
        actual.append(np.asarray(tag_enc))

    # Only consumed by the two-input variant
    # ``model.predict([x_test, user])``, which is currently disabled in
    # favour of the content-only prediction below.
    user = np.asarray(user)
    x_test = np.asarray(x_test)
    predicted = model.predict(x_test)
    actual = np.asarray(actual)

    for i in [3, 5, 10]:
        # presumably (precision, recall, sample count) -- confirm
        # against ``evaluate``.
        p, r, t = evaluate(actual, predicted, i)

        # Weighted by t and divided back out, as in the batched form
        # this was derived from; the leading 0.0 keeps the division
        # floating-point under Python 2.
        precision = (0.0 + p * t) / t
        recall = (0.0 + r * t) / t

        print("Precision @%s:  %s" % (i, precision))
        print("Recall @%s:  %s" % (i, recall))