def build_vectors(articles, weights):
    """
    Build weighted vector representations for a list of articles.

    Each article contributes three feature groups, L2-normalized
    independently and then concatenated column-wise:

      * publication time  (1 column, from ``a.published``)
      * bag-of-words text vector  (``vectorize(a.text)``)
      * concept vector  (``concept_vectorize([...slugs...])``)

    Args:
        articles: iterable of article objects exposing ``published``,
            ``text`` and ``concepts`` (each concept has a ``slug``).
        weights: sequence of three multipliers
            ``[pub_weight, bow_weight, concept_weight]`` applied to the
            corresponding column groups.

    Returns:
        A dense ``numpy`` array, one row per article.
    """
    pub_vecs, bow_vecs, con_vecs = [], [], []
    for a in articles:
        pub_vecs.append(np.array([a.published]))
        bow_vecs.append(vectorize(a.text))
        con_vecs.append(concept_vectorize([c.slug for c in a.concepts]))

    pub_vecs = normalize(csr_matrix(pub_vecs), copy=False)
    bow_vecs = normalize(csr_matrix(bow_vecs), copy=False)
    con_vecs = normalize(csr_matrix(con_vecs), copy=False)

    # Merge the feature groups side by side.
    vecs = hstack([pub_vecs, bow_vecs, con_vecs])

    # Convert to a scipy.sparse.lil_matrix because it is subscriptable.
    vecs = vecs.tolil()

    # Apply weights to the proper column groups. Boundaries are derived
    # from the actual widths of each group instead of being hard-coded:
    # the original used fixed slices (1:101, 101:) whose accompanying
    # comment ("cols 1-101 = bow") was off by one, and which silently
    # misweight columns if the bag-of-words vectorizer's width changes.
    n_pub = pub_vecs.shape[1]
    n_bow = bow_vecs.shape[1]
    vecs[:, :n_pub] *= weights[0]
    vecs[:, n_pub:n_pub + n_bow] *= weights[1]
    vecs[:, n_pub + n_bow:] *= weights[2]

    return vecs.toarray()
        # NOTE(review): this is the tail of a function whose start is outside
        # this chunk; tokens preserved as found, indentation is a best guess.
            'image': 'http://www.argos.la/image.jpg',
            'name': name
        }
    return None


def faux_uri_for_name(name):
    """Return a fake knowledge-base URI for *name* (test stub)."""
    return "http://fauxpedia.org/resource/{0}".format(name)


def faux_commonness_for_name(name):
    """Return a fixed commonness score for any name (test stub)."""
    return 100


def faux_commonness_for_uri(name):
    """Return a fixed commonness score for any URI (test stub)."""
    return 100


def faux_concepts(docs):
    """Return a canned concept list; *docs* is ignored (test stub)."""
    return ['Nautilus', 'Picard']


def faux_summarize(title, text):
    """Return a canned token list; *title* and *text* are ignored (test stub)."""
    return ['this', 'is', 'a', 'fake', 'summary']


def faux_multisummarize(docs):
    """Return a canned token list; *docs* is ignored (test stub)."""
    return ['this', 'is', 'a', 'fake', 'summary']


from galaxy import vectorize

# Compute one real vector at import time; faux_vectorize then always
# returns this cached result, keeping tests fast and deterministic.
cached_vector = vectorize('foo bar')


def faux_vectorize(docs):
    """Return the precomputed vector regardless of *docs* (test stub)."""
    return cached_vector


def faux_save_from_url(url, filename):
    """Pretend to store an image and return a fixed fake S3 URL (test stub)."""
    return 'https://s3.amazon.com/fakeimage.jpg'
def vectors(self):
    """Return the feature vector for this object's text.

    Delegates to the module-level ``vectorize`` helper; its output
    shape/type is determined there, not here.
    """
    return vectorize(self.text)