コード例 #1
0
ファイル: tag_experts.py プロジェクト: liamcreagh/Anthus-News
    def __init__(self):
        """Open the database connection and preload the expert member rows.

        Stores the per-member list query on ``self.query``, fetches every row
        of ``expert_10_50`` into ``self.members`` and prints the row count
        that is kept in ``self.N_members``.
        """
        self.query = """
        select l.list_id,l.list_name,l.list_description from list_rec as l
        join list_member_rec as lm1
        on lm1.list_id=l.list_id
        where lm1.member_id = %s;
        """

        self.connection = connect(
            "dbname='nlstudent' user = '******' password ='******'")
        self.ind = 0
        self.parser = StringParser()

        # One cursor serves both start-up queries and stays on the instance.
        db_cursor = self.connection.cursor()
        self.cursor = db_cursor

        db_cursor.execute("""
        select * from expert_10_50
        """)
        self.members = db_cursor.fetchall()

        db_cursor.execute("""
        select count(*) from expert_10_50
        """)
        member_total = db_cursor.fetchone()[0]
        self.N_members = member_total
        print(member_total)
コード例 #2
0
ファイル: tag_experts.py プロジェクト: liamcreagh/Anthus-News
    def __init__(self):
        """Connect to the database and cache the rows of ``expert_10_50``.

        The parameterised list query is kept on ``self.query`` for later use;
        the member rows go to ``self.members`` and their count, which is also
        printed, to ``self.N_members``.
        """
        self.query = """
        select l.list_id,l.list_name,l.list_description from list_rec as l
        join list_member_rec as lm1
        on lm1.list_id=l.list_id
        where lm1.member_id = %s;
        """

        dsn = "dbname='nlstudent' user = '******' password ='******'"
        self.connection = connect(dsn)
        self.ind = 0
        self.parser = StringParser()

        # The same cursor is reused for both queries below.
        shared_cursor = self.connection.cursor()
        self.cursor = shared_cursor

        shared_cursor.execute("""
        select * from expert_10_50
        """)
        self.members = shared_cursor.fetchall()

        shared_cursor.execute("""
        select count(*) from expert_10_50
        """)
        row_total = shared_cursor.fetchone()[0]
        self.N_members = row_total
        print(row_total)
コード例 #3
0
import string
from nltk import FreqDist
import nltk
from psycopg2._psycopg import DatabaseError
import sys
from parsers.string_parser import StringParser, latin_letters
import psycopg2

__author__ = 'Katharine'

# Connection parameters for the PostgreSQL database (credentials are masked
# in this listing).
conn_string = "dbname='nlstudent' user = '******' password ='******'"

# The connection and its cursor are created at import time and shared by the
# rest of the module.
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

# Shared parser instance and an initially empty dict; where they are used is
# outside this excerpt.
f = StringParser()
s = {}

# Name and description of every list_rec row linked through list_member_rec
# to the member id bound to %s.
get_listinfo_for_member = """SELECT l.list_name,l.list_description FROM list_rec as l JOIN
	list_member_rec as lm ON l.list_id = lm.list_id
	where lm.member_id = %s"""

# "count" column of member_list_count_rec for the member id bound to %s.
# NOTE(review): the join needs member_list_count_rec to expose a list_id
# column — confirm against the schema.
get_listcount_for_member = """SELECT count FROM member_list_count_rec as lm JOIN
	list_rec as l ON l.list_id = lm.list_id
	where lm.member_id = %s;"""

# Name and description of the list_rec row(s) whose list_id is bound to %s.
get_listinfo_for_list = """SELECT l.list_name,l.list_description FROM list_rec as l
	where list_id = %s"""

# Id, name and description of every row in list_rec (takes no parameters).
get_all_listinfo_for_all_lists = """SELECT l.list_id,l.list_name,l.list_description FROM list_rec as l"""
コード例 #4
0
import string
from nltk import FreqDist
import nltk
from psycopg2._psycopg import DatabaseError
import sys
from parsers.string_parser import StringParser, latin_letters
import psycopg2

__author__ = 'Katharine'

# Connection parameters for the PostgreSQL database (credentials are masked
# in this listing).
conn_string = "dbname='nlstudent' user = '******' password ='******'"

# The connection and its cursor are created at import time and shared by the
# rest of the module.
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

# Shared parser instance and an initially empty dict; where they are used is
# outside this excerpt.
f = StringParser()
s = {}

# Name and description of every list_rec row linked through list_member_rec
# to the member id bound to %s.
get_listinfo_for_member = """SELECT l.list_name,l.list_description FROM list_rec as l JOIN
	list_member_rec as lm ON l.list_id = lm.list_id
	where lm.member_id = %s"""

# Disabled variant that counted a member's lists directly from list_member_rec:
# get_listcount_for_member = """SELECT count(lm.list_id) FROM list_member_rec as lm JOIN
# 	list_rec as l ON l.list_id = lm.list_id
# 	where lm.member_id = %s;"""
"""NB relies on table member_list_count_rec containing listcount for members """
# Reads one (member_id, count) row from member_list_count_rec after skipping
# the first 101 rows.
# NOTE(review): there is no ORDER BY, so which row sits at offset 101 is left
# to the database.
get_listcount_for_members = """SELECT member_id,count FROM member_list_count_rec limit 1 offset 101;"""

# Name and description of the list_rec row(s) whose list_id is bound to %s.
get_listinfo_for_list = """SELECT l.list_name,l.list_description FROM list_rec as l
	where list_id = %s"""
コード例 #5
0
ファイル: read_lists.py プロジェクト: liamcreagh/Anthus-News
import nltk
from parsers.string_parser import StringParser, latin_letters
import psycopg2

__author__ = 'Katharine'

# Database access is disabled in this version of the script:
# conn_string = "dbname='nlstudent' user = '******' password ='******'"
#
# conn = psycopg2.connect(conn_string)
# cursor = conn.cursor()

# Read list names and descriptions from a file instead.
# Earlier, disabled attempts at opening the file:
# reader = codecs.open('no_wf_rec.csv', encoding='utf-8')
# reader_csv = csv.reader('no_wf_rec.csv','rb')
csv_file = 'no_wf_rec.csv'
# Parser instance plus empty accumulators; where they are used is outside
# this excerpt.
f = StringParser()
s = {}
tstrout = ''

# Disabled database query for one member's lists:
# get_listinfo_for_member = """SELECT l.list_name,l.list_description FROM list_rec JOIN
# 	# list_member_rec ON list_rec.list_id = list_member_rec.list_id
# 	# where member_id = %s """
# cursor.execute(get_listinfo_for_member,21447363)

# Sniff the CSV dialect from up to the first 1024 characters, rewind the file,
# then build a reader that uses the detected dialect.
# NOTE(review): `reader` wraps a file that is closed as soon as this `with`
# block exits — confirm nothing iterates it after the block.
# NOTE(review): `csv` is used here but its import is not visible in this
# excerpt.
with open(csv_file) as csvfile:
    dialect = csv.Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = csv.reader(csvfile, dialect)

# rows = cursor.fetchall()
# for row in rows:
コード例 #6
0
from django.core.validators import URLValidator
from django.db.models import Q
import requests

__author__ = 'James'

from bs4 import BeautifulSoup
import feedparser
from content_acquisition.models import FeedRec
from articles.models import ArticleRec
from content_acquisition.ArticleWrapper import ArticleWrapper
from newspaper import Article, ArticleException
from parsers.string_parser import StringParser

# Load the pickled classifier once at import time. The context manager closes
# the file handle, which the previous one-liner left open.
# NOTE(review): unpickling runs arbitrary code from the file — './pipe.p' must
# come from a trusted source.
with open('./pipe.p', 'rb') as _pipe_file:
    clf = pickle.load(_pipe_file)
parser = StringParser()

from django import setup

# NOTE(review): setup() runs after the model imports earlier in this module;
# confirm the module is only imported where Django is already configured.
setup()
val = URLValidator()


def aggregate():
    ArticleRec.objects.filter(
        article_published__lte=datetime.datetime.today() -
        datetime.timedelta(days=7)).delete()

    for f in shuffle(FeedRec.objects.all()):

        u = f.feed_url
コード例 #7
0
ファイル: tag_experts.py プロジェクト: liamcreagh/Anthus-News
class _ExpertCorpus(object):
    """Stream one bag-of-words document per qualifying ``expert_10_50`` member.

    Iterating the corpus looks up the lists each member belongs to, counts the
    terms returned by ``self.parser.parse_list`` and yields them as
    ``(token_id, frequency)`` pairs. The externally defined ``expert2doc``
    mapping is updated with the document index given to each yielded member.
    """

    def __init__(self):
        """Open the database connection and preload the expert member rows."""
        self.query = """
        select l.list_id,l.list_name,l.list_description from list_rec as l
        join list_member_rec as lm1
        on lm1.list_id=l.list_id
        where lm1.member_id = %s;
        """

        self.connection = connect(
            "dbname='nlstudent' user = '******' password ='******'")
        self.ind = 0
        self.parser = StringParser()

        # One cursor serves the start-up queries and every later lookup.
        db_cursor = self.connection.cursor()
        self.cursor = db_cursor

        db_cursor.execute("""
        select * from expert_10_50
        """)
        self.members = db_cursor.fetchall()

        db_cursor.execute("""
        select count(*) from expert_10_50
        """)
        member_total = db_cursor.fetchone()[0]
        self.N_members = member_total
        print(member_total)

    def __iter__(self):
        """Yield a ``[(token_id, frequency), ...]`` list per qualifying member.

        A member qualifies when at least one term occurs more than once and
        its most frequent term occurs more than 10 times. Terms missing from
        the dictionary loaded from ``terms.dict`` are left out of the bag.
        """
        vocabulary = Dictionary.load('terms.dict')
        next_doc_id = 0
        for processed, (member_id, _count) in enumerate(self.members):
            # Progress report every 100 members, counting skipped ones too.
            if processed % 100 == 0:
                print('Done', processed)

            self.cursor.execute(self.query, (member_id,))
            term_counts = Counter()
            for row in self.cursor:
                parsed = self.parser.parse_list(title=row[1],
                                                description=row[2])
                term_counts.update(parsed['text'])

            # Keep terms seen more than once, most frequent first.
            ranked = [pair for pair in term_counts.items() if pair[1] > 1]
            ranked.sort(key=operator.itemgetter(1), reverse=True)

            if ranked and ranked[0][1] > 10:
                bag = []
                for term, freq in ranked:
                    try:
                        token_id = vocabulary.token2id[term]
                    except KeyError:
                        # Term is not in the dictionary: skip it.
                        continue
                    bag.append((token_id, freq))
                expert2doc[member_id] = next_doc_id
                next_doc_id += 1
                yield bag
コード例 #8
0
ファイル: tag_experts.py プロジェクト: liamcreagh/Anthus-News
class ExpertCorpus(object):
    """Stream one bag-of-words document per member of ``expert_420``.

    Iterating the corpus looks up the lists each member belongs to, counts the
    terms returned by ``self.parser.parse_list`` and yields them as
    ``(token_id, frequency)`` pairs. The externally defined ``expert2doc``
    mapping is updated with the document index given to each member.
    """

    def __init__(self):
        """Open the database connection and preload the expert member rows."""

        member_query = """
        select * from expert_420
        """

        count_query = """
        select count(*) from expert_420
        """

        # Lists (id, name, description) that the member bound to %s belongs to.
        self.query = """
        select l.list_id,l.list_name,l.list_description from list_rec_420 as l
        join list_member_rec_420 as lm1
        on lm1.list_id=l.list_id
        where lm1.member_id = %s;
        """

        conn_string = "dbname='list_6220' user = '******' password =''"

        self.connection = connect(conn_string)
        self.ind = 0
        self.parser = StringParser()
        self.cursor = self.connection.cursor()
        self.cursor.execute(member_query)
        self.members = self.cursor.fetchall()

        self.cursor.execute(count_query)
        self.N_members = self.cursor.fetchone()[0]
        print(self.N_members)

    def __iter__(self):
        """Yield a ``[(token_id, frequency), ...]`` list for every member.

        Only terms that occur more than 10 times and start with one of the
        externally defined ``topics`` prefixes are kept. Terms missing from
        the dictionary loaded from ``terms.dict`` are left out, so a bag may
        be empty.
        """
        list_dict = Dictionary.load('terms.dict')
        counter = 0
        doc_id = 0
        for member_id, count in self.members:
            # Progress report every 1000 members.
            if counter % 1000 == 0:
                print('Done', counter)

            print(member_id, count)
            self.cursor.execute(self.query, (member_id, ))
            expert_text = Counter()

            for result in self.cursor:
                parsed_text = self.parser.parse_list(title=result[1],
                                                     description=result[2])

                expert_text.update(parsed_text['text'])

            # Build a list, not a generator: the debug print below would
            # otherwise consume the terms and leave word_bag always empty.
            terms = [(e, v) for e, v in expert_text.items()
                     if v > 10 and any(e.startswith(t) for t in topics)]
            counter += 1

            print(terms)

            word_bag = []
            for k, v in terms:
                try:
                    word_bag.append((list_dict.token2id[k], v))
                except KeyError:
                    # Term is not in the dictionary: skip it.
                    pass
            expert2doc[member_id] = doc_id
            doc_id += 1

            yield word_bag
コード例 #9
0
ファイル: read_lists.py プロジェクト: liamcreagh/Anthus-News
import nltk
from parsers.string_parser import StringParser, latin_letters
import psycopg2

__author__ = 'Katharine'

# Database access is disabled in this version of the script:
# conn_string = "dbname='nlstudent' user = '******' password ='******'"
#
# conn = psycopg2.connect(conn_string)
# cursor = conn.cursor()

# Read list names and descriptions from a file instead.
# Earlier, disabled attempts at opening the file:
# reader = codecs.open('no_wf_rec.csv', encoding='utf-8')
# reader_csv = csv.reader('no_wf_rec.csv','rb')
csv_file = 'no_wf_rec.csv'
# Parser instance plus empty accumulators; where they are used is outside
# this excerpt.
f = StringParser()
s = {}
tstrout = ''

# Disabled database query for one member's lists:
# get_listinfo_for_member = """SELECT l.list_name,l.list_description FROM list_rec JOIN
# 	# list_member_rec ON list_rec.list_id = list_member_rec.list_id
# 	# where member_id = %s """
# cursor.execute(get_listinfo_for_member,21447363)

# Sniff the CSV dialect from up to the first 1024 characters, rewind the file,
# then build a reader that uses the detected dialect.
# NOTE(review): `reader` wraps a file that is closed as soon as this `with`
# block exits — confirm nothing iterates it after the block.
# NOTE(review): `csv` is used here but its import is not visible in this
# excerpt.
with open(csv_file) as csvfile:
       dialect = csv.Sniffer().sniff(csvfile.read(1024))
       csvfile.seek(0)
       reader = csv.reader(csvfile, dialect)

# rows = cursor.fetchall()
# for row in rows:
コード例 #10
0
import csv
import string
from nltk import FreqDist
import nltk
from parsers.string_parser import StringParser, latin_letters
import psycopg2
__author__ = 'Katharine'

# Connection parameters for the PostgreSQL database (credentials are masked
# in this listing).
conn_string = "dbname='nlstudent' user = '******' password ='******'"

# The connection and its cursor are created at import time and shared by the
# functions below.
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

# Shared parser instance and an initially empty module-level dict.
f = StringParser()
s = {}

# Name and description of the list_rec row(s) whose list_id is bound to %s.
get_listinfo_for_list = """SELECT l.list_name,l.list_description FROM list_rec as l
	where list_id = %s"""


def get_list_dists_for(list_id):
    cursor.execute(get_listinfo_for_list, [list_id])

    rows = cursor.fetchall()
    tstrout = ''
    for row in rows:
        c_line = str(row)
        c_line = ''.join(filter(lambda x: x in string.printable, c_line))
        # print(c_line)
        if len(c_line):
            s = f.parse(c_line, True)
コード例 #11
0
import string
from nltk import FreqDist
import nltk
from psycopg2._psycopg import DatabaseError
import sys
from parsers.string_parser import StringParser, latin_letters
import psycopg2

__author__ = 'Katharine'

# Connection parameters for the PostgreSQL database (credentials are masked
# in this listing).
conn_string = "dbname='nlstudent' user = '******' password ='******'"

# The connection and its cursor are created at import time and shared by the
# rest of the module.
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

# Shared parser instance and an initially empty dict; where they are used is
# outside this excerpt.
f = StringParser()
s = {}

# Name and description of every list_rec row linked through list_member_rec
# to the member id bound to %s.
get_listinfo_for_member = """SELECT l.list_name,l.list_description FROM list_rec as l JOIN
	list_member_rec as lm ON l.list_id = lm.list_id
	where lm.member_id = %s"""

# "count" column of member_list_count_rec for the member id bound to %s.
# NOTE(review): the join needs member_list_count_rec to expose a list_id
# column — confirm against the schema.
get_listcount_for_member = """SELECT count FROM member_list_count_rec as lm JOIN
	list_rec as l ON l.list_id = lm.list_id
	where lm.member_id = %s;"""

# Name and description of the list_rec row(s) whose list_id is bound to %s.
get_listinfo_for_list = """SELECT l.list_name,l.list_description FROM list_rec as l
	where list_id = %s"""

# Id, name and description of every row in list_rec (takes no parameters).
get_all_listinfo_for_all_lists = """SELECT l.list_id,l.list_name,l.list_description FROM list_rec as l"""
コード例 #12
0
ファイル: tag_experts.py プロジェクト: liamcreagh/Anthus-News
class _ExpertCorpus(object):
    """Iterable corpus built from the members listed in ``expert_10_50``.

    Each iteration step queries the lists of one member, tallies the terms
    returned by ``self.parser.parse_list`` and, for qualifying members, yields
    ``(token_id, frequency)`` pairs while recording the member's document
    index in the externally defined ``expert2doc`` mapping.
    """

    def __init__(self):
        """Connect to the database and cache the rows of ``expert_10_50``."""
        self.query = """
        select l.list_id,l.list_name,l.list_description from list_rec as l
        join list_member_rec as lm1
        on lm1.list_id=l.list_id
        where lm1.member_id = %s;
        """

        dsn = "dbname='nlstudent' user = '******' password ='******'"
        self.connection = connect(dsn)
        self.ind = 0
        self.parser = StringParser()

        # The same cursor is reused for start-up and per-member queries.
        shared_cursor = self.connection.cursor()
        self.cursor = shared_cursor

        shared_cursor.execute("""
        select * from expert_10_50
        """)
        self.members = shared_cursor.fetchall()

        shared_cursor.execute("""
        select count(*) from expert_10_50
        """)
        row_total = shared_cursor.fetchone()[0]
        self.N_members = row_total
        print(row_total)

    def __iter__(self):
        """Yield a ``[(token_id, frequency), ...]`` list per qualifying member.

        A member qualifies when at least one term occurs more than once and
        its most frequent term occurs more than 10 times. Terms missing from
        the dictionary loaded from ``terms.dict`` are left out of the bag.
        """
        vocabulary = Dictionary.load('terms.dict')
        next_doc_id = 0
        for seen, (member_id, _count) in enumerate(self.members):
            # Progress report every 100 members, counting skipped ones too.
            if seen % 100 == 0:
                print('Done', seen)

            self.cursor.execute(self.query, (member_id,))
            tally = Counter()
            for row in self.cursor:
                parsed = self.parser.parse_list(title=row[1],
                                                description=row[2])
                tally.update(parsed['text'])

            # Keep terms seen more than once, most frequent first.
            ranked = [pair for pair in tally.items() if pair[1] > 1]
            ranked.sort(key=operator.itemgetter(1), reverse=True)

            if ranked and ranked[0][1] > 10:
                bag = []
                for term, freq in ranked:
                    try:
                        token_id = vocabulary.token2id[term]
                    except KeyError:
                        # Term is not in the dictionary: skip it.
                        continue
                    bag.append((token_id, freq))
                expert2doc[member_id] = next_doc_id
                next_doc_id += 1
                yield bag
コード例 #13
0
ファイル: tag_experts.py プロジェクト: liamcreagh/Anthus-News
class ExpertCorpus(object):
    """Stream one bag-of-words document per member of ``expert_420``.

    Iterating the corpus looks up the lists each member belongs to, counts the
    terms returned by ``self.parser.parse_list`` and yields them as
    ``(token_id, frequency)`` pairs. The externally defined ``expert2doc``
    mapping is updated with the document index given to each member.
    """

    def __init__(self):
        """Open the database connection and preload the expert member rows."""

        member_query = """
        select * from expert_420
        """

        count_query = """
        select count(*) from expert_420
        """

        # Lists (id, name, description) that the member bound to %s belongs to.
        self.query = """
        select l.list_id,l.list_name,l.list_description from list_rec_420 as l
        join list_member_rec_420 as lm1
        on lm1.list_id=l.list_id
        where lm1.member_id = %s;
        """

        conn_string = "dbname='list_6220' user = '******' password =''"

        self.connection = connect(conn_string)
        self.ind = 0
        self.parser = StringParser()
        self.cursor = self.connection.cursor()
        self.cursor.execute(member_query)
        self.members = self.cursor.fetchall()

        self.cursor.execute(count_query)
        self.N_members = self.cursor.fetchone()[0]
        print(self.N_members)

    def __iter__(self):
        """Yield a ``[(token_id, frequency), ...]`` list for every member.

        Only terms that occur more than 10 times and start with one of the
        externally defined ``topics`` prefixes are kept. Terms missing from
        the dictionary loaded from ``terms.dict`` are left out, so a bag may
        be empty.
        """
        list_dict = Dictionary.load('terms.dict')
        counter = 0
        doc_id = 0
        for member_id, count in self.members:
            # Progress report every 1000 members.
            if counter % 1000 == 0:
                print('Done', counter)

            print(member_id, count)
            self.cursor.execute(self.query, (member_id,))
            expert_text = Counter()

            for result in self.cursor:
                parsed_text = self.parser.parse_list(title=result[1], description=result[2])

                expert_text.update(parsed_text['text'])

            # Build a list, not a generator: the debug print below would
            # otherwise consume the terms and leave word_bag always empty.
            terms = [(e, v) for e, v in expert_text.items()
                     if v > 10 and any(e.startswith(t) for t in topics)]
            counter += 1

            print(terms)

            word_bag = []
            for k, v in terms:
                try:
                    word_bag.append((list_dict.token2id[k], v))
                except KeyError:
                    # Term is not in the dictionary: skip it.
                    pass
            expert2doc[member_id] = doc_id
            doc_id += 1

            yield word_bag
コード例 #14
0
import string
from nltk import FreqDist
import nltk
from psycopg2._psycopg import DatabaseError
import sys
from parsers.string_parser import StringParser, latin_letters
import psycopg2

__author__ = 'Katharine'

# Connection parameters for the PostgreSQL database (credentials are masked
# in this listing).
conn_string = "dbname='nlstudent' user = '******' password ='******'"

# The connection and its cursor are created at import time and shared by the
# rest of the module.
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

# Shared parser instance and an initially empty dict; where they are used is
# outside this excerpt.
f = StringParser()
s = {}

# Name and description of every list_rec row linked through list_member_rec
# to the member id bound to %s.
get_listinfo_for_member = """SELECT l.list_name,l.list_description FROM list_rec as l JOIN
	list_member_rec as lm ON l.list_id = lm.list_id
	where lm.member_id = %s"""

# Disabled variant that counted a member's lists directly from list_member_rec:
# get_listcount_for_member = """SELECT count(lm.list_id) FROM list_member_rec as lm JOIN
# 	list_rec as l ON l.list_id = lm.list_id
# 	where lm.member_id = %s;"""

"""NB relies on table member_list_count_rec containing listcount for members """
# Reads one (member_id, count) row from member_list_count_rec after skipping
# the first 101 rows.
# NOTE(review): there is no ORDER BY, so which row sits at offset 101 is left
# to the database.
get_listcount_for_members = """SELECT member_id,count FROM member_list_count_rec limit 1 offset 101;"""


get_listinfo_for_list = """SELECT l.list_name,l.list_description FROM list_rec as l