Example 1
import numpy as np
import psycopg2
import psycopg2.extras
import scipy.io

# Index, SQL_generator and assert_sorted_indices are project-local helpers


def build_parallel_corpus():
    """ Build the parallel corpus as two row-aligned sparse count
        matrices and write them to 'a.mtx' and 'q.mtx'.
    """

    con = psycopg2.connect(database='quora', user='******')
    cur = con.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE, cur)

    # we actually only need to cPickle the CountVectorizer object
    # so we can load the stop words and other preprocessing opt.
    index_directory = '/home/kyrre/michaeljackson'
    idx = Index.read_index(index_directory)

    query = """SELECT DISTINCT(Question.qid), concat(main, ' ', info) question , A.contents answers 
               FROM Question 
               JOIN (SELECT string_agg(content, ' ') as contents, qid FROM Answer GROUP BY qid) A 
               ON Question.qid = A.qid LIMIT 5000;
            """

    # a cursor can only be iterated once, so the query runs twice:
    # one pass for the answer side, one for the question side
    cur.execute(query)
    a = idx.count_vect.transform(SQL_generator(cur, 'answers'))

    cur.execute(query)
    q = idx.count_vect.transform(SQL_generator(cur, 'question'))

    # helper: collapse an np.matrix result into a flat 1-d ndarray
    od = lambda x: np.squeeze(np.asarray(x))

    asum = od(a.sum(axis=1))
    qsum = od(q.sum(axis=1))

    # keep only rows (documents) that are non-empty on *both* sides,
    # so row i of `a` stays aligned with row i of `q`
    nnz_indices = np.intersect1d(od(np.argwhere(asum != 0)),
                                 od(np.argwhere(qsum != 0)))
    a = a[nnz_indices, :]
    q = q[nnz_indices, :]

    asum = od(a.sum(axis=0))
    qsum = od(q.sum(axis=0))

    # likewise drop vocabulary columns that are all-zero in either matrix
    nnz_cols_indices = np.intersect1d(od(np.argwhere(asum != 0)),
                                      od(np.argwhere(qsum != 0)))

    a = a[:, nnz_cols_indices]
    q = q[:, nnz_cols_indices]

    assert a.shape == q.shape

    assert_sorted_indices(a)
    assert_sorted_indices(q)

    # mmwrite accepts a filename directly; passing a text-mode file
    # handle breaks under Python 3
    scipy.io.mmwrite('a.mtx', a)
    scipy.io.mmwrite('q.mtx', q)
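
Since the two matrices are written row-aligned, reloading them for a quick
sanity check might look like this (a minimal sketch; only the filenames
'a.mtx' and 'q.mtx' are taken from the function above):

import scipy.io

# reload the matrices written by build_parallel_corpus()
a = scipy.io.mmread('a.mtx').tocsr()
q = scipy.io.mmread('q.mtx').tocsr()

# row i of q and row i of a describe the same question
assert a.shape == q.shape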
Example 2
from collections import OrderedDict

import psycopg2
import psycopg2.extras
# assuming scikit-learn's Bunch here; the original project may ship its own
from sklearn.utils import Bunch

# Index is a project-local helper, as in Example 1


def build_parallel_corpus():
    """ Return a Bunch of OrderedDicts containing the parallel
        corpus entries, keyed by question id.
    """

    con = psycopg2.connect(database='quora', user='******')
    cur = con.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE, cur)

    # we actually only need to cPickle the CountVectorizer object
    # so we can load the stop words and other preprocessing opt.
    index_directory = '/home/kyrre/michaeljackson'
    idx = Index.read_index(index_directory)

    query = """SELECT DISTINCT(Question.qid), concat(main, ' ', info) question , A.contents answers from Question JOIN (SELECT
            string_agg(content, ' ') as contents, qid FROM Answer GROUP BY qid) A ON
            Question.qid = A.qid;
            """

    cur.execute(query)

    # parallel corpora
    questions = OrderedDict()
    answers = OrderedDict()

    for record in cur:
        # featurize() (a project-specific vectorizer method) apparently
        # returns (counts, vocabulary indices) for a single document
        qdata, qindices = idx.count_vect.featurize(record['question'])
        adata, aindices = idx.count_vect.featurize(record['answers'])

        # skip entries where either side vectorizes to nothing
        if adata.size == 0 or qdata.size == 0:
            continue

        questions[record['qid']] = [qdata, qindices]
        answers[record['qid']] = [adata, aindices]
    
    container = Bunch(questions=questions, answers=answers, count_vect=idx.count_vect)

    return container
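
A caller might consume the returned Bunch like this (a minimal sketch;
build_parallel_corpus is the function above, and the [data, indices] layout
follows its loop body):

corpus = build_parallel_corpus()

for qid, (qdata, qindices) in corpus.questions.items():
    # entries on both sides are keyed by the same question id
    adata, aindices = corpus.answers[qid]
    print(qid, qdata.sum(), adata.sum())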