Beispiel #1
0
class _ExpertCorpus(object):
    """Iterable corpus that yields one bag-of-words per qualifying member.

    The member rows of ``expert_10_50`` are fetched once at construction.
    Each pass over the corpus queries the lists a member belongs to, counts
    the tokens the parser extracts from list names and descriptions, and
    yields the counts translated to dictionary token ids.
    """

    def __init__(self):
        """Open the database connection and cache the member rows."""
        conn_string = "dbname='nlstudent' user = '******' password ='******'"

        member_query = """
        select * from expert_10_50
        """

        count_query = """
        select count(*) from expert_10_50
        """

        # Per-member lookup of list id, name and description; the member id
        # is bound as a query parameter in __iter__.
        self.query = """
        select l.list_id,l.list_name,l.list_description from list_rec as l
        join list_member_rec as lm1
        on lm1.list_id=l.list_id
        where lm1.member_id = %s;
        """

        self.connection = connect(conn_string)
        self.ind = 0
        self.parser = StringParser()
        self.cursor = self.connection.cursor()

        self.cursor.execute(member_query)
        self.members = self.cursor.fetchall()

        self.cursor.execute(count_query)
        row = self.cursor.fetchone()
        self.N_members = row[0]
        print(self.N_members)

    def __iter__(self):
        """Yield a list of ``(token_id, count)`` pairs per qualifying member.

        A member qualifies when at least one term occurs more than once and
        the most frequent term occurs more than 10 times. Terms missing from
        the loaded dictionary are dropped. For every yielded document the
        module-level ``expert2doc`` mapping records the member's document
        index.
        """
        vocabulary = Dictionary.load('terms.dict')
        next_doc_id = 0
        for processed, (member_id, _row_count) in enumerate(self.members):
            if processed % 100 == 0:
                print('Done', processed)

            self.cursor.execute(self.query, (member_id, ))
            term_counts = Counter()
            for row in self.cursor:
                parsed = self.parser.parse_list(title=row[1],
                                                description=row[2])
                term_counts.update(parsed['text'])

            # most_common() is a stable sort by descending count, so
            # filtering afterwards keeps the same order as filter-then-sort.
            ranked = [pair for pair in term_counts.most_common()
                      if pair[1] > 1]

            qualifies = bool(ranked) and ranked[0][1] > 10
            if not qualifies:
                continue

            word_bag = []
            for token, freq in ranked:
                try:
                    token_id = vocabulary.token2id[token]
                except KeyError:
                    # Token is not in the dictionary: leave it out.
                    continue
                word_bag.append((token_id, freq))

            expert2doc[member_id] = next_doc_id
            next_doc_id += 1
            yield word_bag
Beispiel #2
0
class _ExpertCorpus(object):
    """Stream bag-of-words documents for the members of ``expert_10_50``.

    Construction connects to the database and loads all member rows; every
    iteration pass issues one query per member to collect the text of the
    lists that member appears on.
    """

    def __init__(self):
        """Connect, create the parser and cursor, and load the member rows."""
        member_query = """
        select * from expert_10_50
        """

        count_query = """
        select count(*) from expert_10_50
        """

        # Executed once per member in __iter__ with the member id bound
        # as the single parameter.
        self.query = """
        select l.list_id,l.list_name,l.list_description from list_rec as l
        join list_member_rec as lm1
        on lm1.list_id=l.list_id
        where lm1.member_id = %s;
        """

        conn_string = "dbname='nlstudent' user = '******' password ='******'"

        self.connection = connect(conn_string)
        self.ind = 0
        self.parser = StringParser()

        cursor = self.connection.cursor()
        self.cursor = cursor

        cursor.execute(member_query)
        self.members = cursor.fetchall()

        cursor.execute(count_query)
        self.N_members = cursor.fetchone()[0]
        print(self.N_members)

    def _member_term_counts(self, member_id):
        """Return a Counter of parsed tokens over all lists of *member_id*."""
        self.cursor.execute(self.query, (member_id,))
        counts = Counter()
        for record in self.cursor:
            parsed = self.parser.parse_list(title=record[1], description=record[2])
            counts.update(parsed['text'])
        return counts

    def __iter__(self):
        """Yield ``[(token_id, count), ...]`` for each qualifying member.

        Only terms seen more than once are kept, ordered by descending
        count. A member is emitted only if its top term count exceeds 10.
        Terms absent from the dictionary are skipped, and the module-level
        ``expert2doc`` mapping is updated with the emitted document index.
        """
        list_dict = Dictionary.load('terms.dict')
        seen = 0
        emitted = 0
        for member_id, _unused in self.members:
            if seen % 100 == 0:
                print('Done', seen)

            counts = self._member_term_counts(member_id)

            frequent = [(term, n) for term, n in counts.items() if n > 1]
            # list.sort is stable, like sorted(), so ties keep Counter order.
            frequent.sort(key=lambda pair: pair[1], reverse=True)
            seen += 1

            if frequent and frequent[0][1] > 10:
                bag = []
                for term, n in frequent:
                    try:
                        term_id = list_dict.token2id[term]
                    except KeyError:
                        # Not in the dictionary: drop the term.
                        continue
                    bag.append((term_id, n))
                expert2doc[member_id] = emitted
                emitted += 1
                yield bag
Beispiel #3
0
class ExpertCorpus(object):
    """Iterable corpus yielding one bag-of-words per member of ``expert_420``.

    Construction connects to the database and loads all member rows. Each
    iteration pass queries, per member, the lists that member appears on,
    counts the tokens parsed from list names and descriptions, and yields
    the topic-related counts translated to dictionary token ids.
    """

    def __init__(self):
        """Open the database connection and cache the member rows."""
        member_query = """
        select * from expert_420
        """

        count_query = """
        select count(*) from expert_420
        """

        # Executed once per member in __iter__; the member id is bound as
        # the single query parameter.
        self.query = """
        select l.list_id,l.list_name,l.list_description from list_rec_420 as l
        join list_member_rec_420 as lm1
        on lm1.list_id=l.list_id
        where lm1.member_id = %s;
        """

        conn_string = "dbname='list_6220' user = '******' password =''"

        self.connection = connect(conn_string)
        self.ind = 0
        self.parser = StringParser()
        self.cursor = self.connection.cursor()
        self.cursor.execute(member_query)
        self.members = self.cursor.fetchall()

        self.cursor.execute(count_query)
        self.N_members = self.cursor.fetchone()[0]
        print(self.N_members)

    def __iter__(self):
        """Yield a list of ``(token_id, count)`` pairs for every member.

        Only terms counted more than 10 times that start with one of the
        module-level ``topics`` prefixes are kept; terms missing from the
        loaded dictionary are dropped. A document (possibly empty) is
        yielded for every member, and the module-level ``expert2doc``
        mapping records each member's document index.
        """
        list_dict = Dictionary.load('terms.dict')
        # NOTE(review): token2id is assumed to be a plain mapping of
        # token -> id, as in gensim's Dictionary; confirm.
        token2id = list_dict.token2id
        doc_id = 0
        for counter, (member_id, count) in enumerate(self.members):
            if counter % 1000 == 0:
                print('Done', counter)

            print(member_id, count)
            self.cursor.execute(self.query, (member_id, ))
            expert_text = Counter()

            for result in self.cursor:
                parsed_text = self.parser.parse_list(title=result[1],
                                                     description=result[2])
                expert_text.update(parsed_text['text'])

            # Must be a list, not a generator: it is printed and then
            # iterated, and a generator would be exhausted by the first
            # pass, leaving every word_bag empty.
            terms = [(e, v) for e, v in expert_text.items()
                     if v > 10 and any(e.startswith(t) for t in topics)]

            print(terms)

            word_bag = [(token2id[k], v) for k, v in terms if k in token2id]
            expert2doc[member_id] = doc_id
            doc_id += 1

            yield word_bag
Beispiel #4
0
class ExpertCorpus(object):
    """Stream bag-of-words documents for the members of ``expert_420``.

    The member rows are fetched once in the constructor. Iterating the
    corpus runs one list query per member, counts the parsed tokens, keeps
    the frequent topic-related ones and yields them as token-id/count pairs.
    """

    def __init__(self):
        """Connect, create the parser and cursor, and load the member rows."""
        member_query = """
        select * from expert_420
        """

        count_query = """
        select count(*) from expert_420
        """

        # Per-member lookup of list id, name and description; the member id
        # is supplied as a query parameter in __iter__.
        self.query = """
        select l.list_id,l.list_name,l.list_description from list_rec_420 as l
        join list_member_rec_420 as lm1
        on lm1.list_id=l.list_id
        where lm1.member_id = %s;
        """

        conn_string = "dbname='list_6220' user = '******' password =''"

        self.connection = connect(conn_string)
        self.ind = 0
        self.parser = StringParser()
        self.cursor = self.connection.cursor()
        self.cursor.execute(member_query)
        self.members = self.cursor.fetchall()

        self.cursor.execute(count_query)
        self.N_members = self.cursor.fetchone()[0]
        print(self.N_members)

    def __iter__(self):
        """Yield ``[(token_id, count), ...]`` for each member, in row order.

        A term is kept when its count exceeds 10 and it starts with any
        prefix in the module-level ``topics``. Terms that the loaded
        dictionary does not know are skipped. Every member produces a
        document, even an empty one, and its index is stored in the
        module-level ``expert2doc`` mapping.
        """
        list_dict = Dictionary.load('terms.dict')
        # NOTE(review): token2id is assumed to behave like a dict
        # (gensim Dictionary); confirm.
        token2id = list_dict.token2id
        doc_id = 0
        for counter, (member_id, count) in enumerate(self.members):
            if counter % 1000 == 0:
                print('Done', counter)

            print(member_id, count)
            self.cursor.execute(self.query, (member_id,))
            expert_text = Counter()

            for result in self.cursor:
                parsed_text = self.parser.parse_list(title=result[1], description=result[2])
                expert_text.update(parsed_text['text'])

            # Build a real list: the terms are both printed and iterated
            # below, and a one-shot generator would be empty on the second
            # pass, so no term would ever reach word_bag.
            terms = [
                (e, v)
                for e, v in expert_text.items()
                if v > 10 and any(e.startswith(t) for t in topics)
            ]

            print(terms)

            word_bag = [(token2id[k], v) for k, v in terms if k in token2id]
            expert2doc[member_id] = doc_id
            doc_id += 1

            yield word_bag