import requests
from lxml import html
from readability import Document
from urllib.parse import urlparse

def recommend_by_url(url):
    parsed = urlparse(url)
    # Fetch the page and pull out its readable text with readability-lxml.
    doc = Document(requests.get(url).content)
    content = html.fromstring(doc.content()).xpath('string()')
    # Project the page into the LSI space and rank every indexed article
    # by cosine similarity.
    bigrams = make_bigrams(content)
    vec_bow = dictionary.doc2bow(bigrams)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    docs = sorted(enumerate(sims), key=lambda item: -item[1])
    results, seen = [], set()
    for doc_id, score in docs:  # renamed from doc so it doesn't shadow the Document above
        res = ARTICLES[doc_id]
        if 'url' not in res or res['url'] in seen:
            continue
        seen.add(res['url'])
        # Skip recommendations hosted on the same site as the query article.
        p = urlparse(res['url'])
        if p.hostname.endswith(parsed.hostname):
            continue
        res['score'] = float(score)
        # Strip the bulky fields before returning the result.
        if 'content' in res:
            del res['content']
        if 'html' in res:
            del res['html']
        if res['summary']:
            res['summary'] = res['summary'].strip()
        results.append(res)
        if len(results) > 14:  # return at most 15 recommendations
            break
    return results
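Calling the function then looks like this; the URL below is just a placeholder for any article you want recommendations for:

for hit in recommend_by_url('http://example.com/some-article'):
    print(hit['score'], hit['url'])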
import json

def generate_texts():
    with open('texts.json', 'w') as fh:
        texts = []
        for i, article in enumerate(table):
            # Tokenize each stored article into bigrams for the LSI corpus.
            article['bigrams'] = make_bigrams(article.get('content', ''))
            print(i, len(article['bigrams']))
            # Append the URL as the last element so each row stays self-identifying.
            texts.append(article['bigrams'] + [article.get('url')])
        json.dump(texts, fh)
    return texts
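The module-level dictionary, lsi, and index objects that recommend_by_url relies on are built from these texts with gensim. The snippet below is a minimal sketch of that step, assuming the rows written by generate_texts (bigram tokens plus a trailing URL) and an arbitrary num_topics of 200; it is not the exact training code from this project:

import json
from gensim import corpora, models, similarities

with open('texts.json') as fh:
    rows = json.load(fh)

# Each row is [bigram, bigram, ..., url]; drop the trailing URL before training.
texts = [row[:-1] for row in rows]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=200)  # num_topics is a guess
index = similarities.MatrixSimilarity(lsi[corpus])

MatrixSimilarity keeps the whole index in memory, which is fine for a few thousand articles; gensim's Similarity class is the disk-backed alternative for larger corpora.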