def build_parallel_corpus():
    """Build aligned question/answer term-count matrices and write them to disk.

    Pulls up to 5000 question/answer pairs from the ``quora`` database,
    vectorizes both sides with the pickled ``CountVectorizer`` stored in the
    on-disk index, drops rows and columns that are all-zero on either side
    (so the two matrices stay aligned), and writes the results as
    ``a.mtx`` (answers) and ``q.mtx`` (questions) in Matrix Market format.

    Returns:
        None.  Output is written to ``a.mtx`` and ``q.mtx`` in the current
        working directory.
    """
    con = psycopg2.connect(database='quora', user='******')
    try:
        cur = con.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        psycopg2.extensions.register_type(psycopg2.extensions.UNICODE, cur)

        # we actually only need to cPickle the CountVectorizer object
        # so we can load the stop words and other preprocessing opt.
        index_directory = '/home/kyrre/michaeljackson'
        idx = Index.read_index(index_directory)

        query = """SELECT DISTINCT(Question.qid),
                          concat(main, ' ', info) question,
                          A.contents answers
                   FROM Question
                   JOIN (SELECT string_agg(content, ' ') as contents, qid
                         FROM Answer
                         GROUP BY qid) A
                   ON Question.qid = A.qid
                   LIMIT 5000;
                """

        cur.execute(query)
        a = idx.count_vect.transform(SQL_generator(cur, 'answers'))

        # the generator exhausts the cursor, so re-run the query for the
        # question side
        cur.execute(query)
        q = idx.count_vect.transform(SQL_generator(cur, 'question'))
    finally:
        # always release the connection, even if vectorization fails
        con.close()

    def od(x):
        # collapse a (possibly matrix-typed) result to a flat ndarray
        return np.squeeze(np.asarray(x))

    # drop rows (documents) that are empty on either side, keeping the
    # two matrices row-aligned
    asum = od(a.sum(axis=1))
    qsum = od(q.sum(axis=1))
    nnz_indices = np.intersect1d(od(np.argwhere(asum != 0)),
                                 od(np.argwhere(qsum != 0)))
    a = a[nnz_indices, :]
    q = q[nnz_indices, :]

    # drop columns (vocabulary terms) that never occur on either side
    asum = od(a.sum(axis=0))
    qsum = od(q.sum(axis=0))
    nnz_cols_indices = np.intersect1d(od(np.argwhere(asum != 0)),
                                      od(np.argwhere(qsum != 0)))
    a = a[:, nnz_cols_indices]
    q = q[:, nnz_cols_indices]

    assert a.shape == q.shape

    assert_sorted_indices(a)
    assert_sorted_indices(q)

    # pass the target filename directly: mmwrite opens, writes and closes
    # the file itself (the old open(..., 'w') handles were never closed)
    scipy.io.mmwrite('a.mtx', a)
    scipy.io.mmwrite('q.mtx', q)
def build_parallel_corpus():
    """Return a :class:`Bunch` holding the parallel question/answer corpus.

    Pulls every question/answer pair from the ``quora`` database, featurizes
    both sides with the pickled ``CountVectorizer`` stored in the on-disk
    index, and collects the sparse (data, indices) pairs keyed by question
    id.  Pairs where either side featurizes to nothing are skipped so the
    two mappings stay aligned.

    NOTE(review): this module appears to define ``build_parallel_corpus``
    twice; whichever ``def`` runs last wins.  Consider renaming one of them.

    Returns:
        Bunch with attributes:
            questions  -- OrderedDict mapping qid -> [data, indices]
            answers    -- OrderedDict mapping qid -> [data, indices]
            count_vect -- the CountVectorizer used for featurization
    """
    con = psycopg2.connect(database='quora', user='******')
    try:
        cur = con.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        psycopg2.extensions.register_type(psycopg2.extensions.UNICODE, cur)

        # we actually only need to cPickle the CountVectorizer object
        # so we can load the stop words and other preprocessing opt.
        index_directory = '/home/kyrre/michaeljackson'
        idx = Index.read_index(index_directory)

        query = """SELECT DISTINCT(Question.qid),
                          concat(main, ' ', info) question,
                          A.contents answers
                   from Question
                   JOIN (SELECT string_agg(content, ' ') as contents, qid
                         FROM Answer
                         GROUP BY qid) A
                   ON Question.qid = A.qid;
                """
        cur.execute(query)

        # parallel corpora, keyed by question id; insertion order preserved
        questions = OrderedDict()
        answers = OrderedDict()

        for record in cur:
            qdata, qindices = idx.count_vect.featurize(record['question'])
            adata, aindices = idx.count_vect.featurize(record['answers'])

            # skip "empty" entries so questions and answers stay aligned
            if adata.size == 0 or qdata.size == 0:
                continue

            questions[record['qid']] = [qdata, qindices]
            answers[record['qid']] = [adata, aindices]
    finally:
        # always release the connection, even if featurization fails
        con.close()

    return Bunch(questions=questions,
                 answers=answers,
                 count_vect=idx.count_vect)