Example #1
    def foldin_item(self, itemid, text):
        global users_items, uu_enh, ii_enh, docids
        # TODO check if itemid already exists???

        # first, deal with folding in the new item!
        ttdm = tfidf.transform([text])
        dT = ttdm.todense()
        d_hat = np.dot(dT, self.U).dot(self.SI)
        self.VT = np.hstack((lsi.VT, d_hat.T))

        # next, calculate this document's similarity vs. each user's reading
        # history
        # insert similarity score into users_items sparse matrix IFF exceed thr
        # create virtual doc
        A = sps.lil_matrix(users_items)
        A._shape = (A.shape[0], A.shape[1] + 1)
        for user in userids:
            ui = np.where(userids == user)[0].item()
            dd = users_items.getrow(ui).nonzero()[1]  # <<< including similar, but unread items!
            tids = []
            for did in docids[dd]:
                tids.extend(terms[corpus[did]])
            vd = " ".join(tids)
            #            tt = []
            #            for d in dd:
            #                tt.append()
            #            vd = " ".join( np.concatenate([corp[e] for e in dd]) )

            tmp = tfidf.transform([vd])
            qT = tmp.todense()
            q_hat = np.dot(qT, self.U).dot(self.SI)
            s = cossim(q_hat, d_hat)
            # >>> threshold also used when folding in new items <<<
            if s > 0.2:
                A[ui, -1] = s

        # TODO is this really the right thing to do???                    !!!!!
        # this here is O(n^2)                                             !!!!!
        users_items = A.tocsr()
        uu_enh = cossim(users_items)
        ii_enh = cossim(users_items.transpose())

        docids = np.append(docids, itemid)
        corpus[itemid] = ttdm.indices

        print("Docids: ", docids)
        print(corpus)
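The fold-in above relies on the factors U, SI (the inverted singular values) and VT from a truncated SVD of the TF-IDF term-document matrix. Below is a minimal, self-contained sketch of that setup; the names docs and k and the use of scipy's svds are assumptions for illustration, not taken from the original class.

import numpy as np
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["old document one", "old document two", "another old document"]
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(docs)              # documents x terms

k = 2
U, s, VT = svds(X.T, k=k)                  # terms x documents decomposition
SI = np.diag(1.0 / s)                      # inverse singular values

# fold a new document into the existing latent space without refitting
new_tdm = tfidf.transform(["one more new document"])
d_hat = new_tdm.toarray() @ U @ SI         # 1 x k latent representation
VT = np.hstack((VT, d_hat.T))              # append as a new column of VT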
Example #2
def graphAll(df,
             max_features=1000,
             keep_unconnected=False,
             lowerThresh=.9,
             upperThresh=1.1):
    """
    Option for graphing the similarity of stories.
    If keep_unconnected == True, nodes with no connecting edges will be included.
    df -- contains the stories (newspaper name, headlines, body text)
    max_features -- dimension of TFiDF vectorization of body text from collection of day's stories
    lowerThresh -- lower bound for similarity of cossim measure for connecting story nodes
    upperThresh -- upper bound to lowerThresh
    """
    corpus = df.loc[:, 'body'].fillna('').str.lower().values
    vectorizer = TfidfVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(corpus)
    G = nx.Graph()
    edges = []
    for x in range(0, len(df)):
        simMeasures = cossim(X[x], X)
        matches = df.loc[(simMeasures[0] >= lowerThresh) &
                         (simMeasures[0] <= upperThresh), :].index
        if not keep_unconnected:
            matches = [elm for elm in matches if elm != x]
        for elm in matches:
            edges.append((x, elm))
    G.add_edges_from(edges)
    return G, vectorizer
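A hypothetical way to call graphAll, assuming cossim is sklearn's cosine_similarity and the imports below match what the original module already provides:

import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cossim

df = pd.DataFrame({
    'filename': ['a.txt', 'b.txt', 'c.txt'],
    'headline': ['Flood hits town', 'Town flooded', 'Sports roundup'],
    'body': ['heavy rain flooded the town overnight',
             'the town was flooded after heavy overnight rain',
             'local team wins the season opener'],
})

G, vectorizer = graphAll(df, max_features=100, lowerThresh=0.5, upperThresh=1.1)
print(G.edges())   # the two near-duplicate flood stories are expected to be linked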
Example #3
    def summarize(self, document, sum_len=5):
        """ Centroid defined as the sum of all tfidf vectors of the document.
        Summaries are made by greedily adding in sentences such that summed cossim
        between the summary vector and the centroid is maximized at each iteration,
        until sum_len is reached.  
        
        Input:
            document: Document class containing list of Sentence instances
            sum_len: length for the summary to be (int)
        Output:
            list of sentences that 'summarize' the input document.
        """
        self.document = document
        
        self.submatrix = self.matrix[document.i1:document.i2]
        self.centroid = sum(self.submatrix[:])
        self.submatrix = self.method()

        summary, sum_idx = [], []

        while len(summary) < sum_len:
            sims = np.ravel([cossim(self.centroid, sum(summary + [i]))
                             for i in self.submatrix])
            best = np.argmax(sims)
            summary.append(self.submatrix[best])
            sum_idx.append(best)
        
        return [self.document.text[i] for i in sum_idx]
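Since the surrounding class (self.matrix, self.method, the Document type) is not shown, here is a standalone sketch of the same greedy centroid idea over a plain list of sentences; the vectorizer choice and the function name are assumptions.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cossim

def greedy_centroid_summary(sentences, sum_len=2):
    matrix = TfidfVectorizer().fit_transform(sentences).toarray()
    centroid = matrix.sum(axis=0, keepdims=True)     # sum of all tfidf vectors
    summary_vecs, sum_idx = [], []
    while len(summary_vecs) < sum_len:
        sims = np.ravel([cossim(centroid, sum(summary_vecs) + row.reshape(1, -1))
                         for row in matrix])
        best = int(np.argmax(sims))
        summary_vecs.append(matrix[best].reshape(1, -1))
        sum_idx.append(best)
    return [sentences[i] for i in sum_idx]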
Example #4
def getsimmatrix(embvectors, N, embchars):
    corrmat = np.zeros((len(embchars), len(embchars)))

    for i, e in enumerate(embvectors):
        if i not in embchars:
            continue
        for j, d in enumerate(embvectors):
            if j not in embchars:
                continue
            corrmat[i][j] = cossim(e.reshape(1, -1), d.reshape(1, -1))
    return corrmat
Example #5
def update_user_reading(user, item):
    # TODO isn't there some other way to do this right?
    global users_items, uu_enh, ii_enh

    ui = np.where(userids == user)[0]
    ii = np.where(docids == item)[0]

    if ui.size == 0:
        raise Exception("User %s not found!" % user)
    if ii.size == 0:
        raise Exception("Item %s not found!" % item)

    ui = ui.item()
    ii = ii.item()

    tmp = sps.lil_matrix(users_items)
    tmp[ui, ii] = 1.0

    # TODO is this really the right thing to do???                        !!!!!
    # this here is O(n^2)                                                 !!!!!
    users_items = tmp.tocsr()
    uu_enh = cossim(users_items)
    ii_enh = cossim(users_items.transpose())

    # TODO is this really the right thing to do???                        !!!!!
    ur = users_items.getrow(ui)
    ri = ur.indices[np.where(ur.data == 1.0)]

    tids = []
    for did in docids[ri]:
        tids.extend(terms[corpus[did]])
    vd = " ".join(tids)
    #vd = " ".join( np.concatenate([corp[e] for e in d]) )
    _t = tfidf.transform([vd])
    ti = np.argsort(-_t.data)
    users_terms[user] = list(zip(_t.indices[ti], _t.data[ti]))
Example #6
def get_similarity_matrix(embedded_vectors, embedded_chars):
    """

    :param embedded_vectors:
    :param embedded_chars:
    :return:
    """
    correlation_matrix = np.zeros((len(embedded_chars), len(embedded_chars)))

    for i, embedded_vector1 in enumerate(embedded_vectors):
        if i not in embedded_chars or embedded_vector1 is None:
            continue
        for j, embedded_vector2 in enumerate(embedded_vectors):
            if j not in embedded_chars or embedded_vector2 is None:
                continue
            correlation_matrix[i][j] = cossim(embedded_vector1.reshape(1, -1), embedded_vector2.reshape(1, -1))
    return correlation_matrix
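Hypothetical usage of get_similarity_matrix, assuming cossim is sklearn's cosine_similarity and the embeddings are plain NumPy vectors indexed by position (the indices in embedded_chars must be valid positions in the output matrix):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cossim

rng = np.random.default_rng(0)
embedded_vectors = [rng.standard_normal(8) for _ in range(4)]
embedded_chars = {0, 1, 2}   # vector 3 is skipped because it is not in this set

sim = get_similarity_matrix(embedded_vectors, embedded_chars)
print(np.round(sim, 2))      # 3 x 3 matrix of pairwise cosine similarities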
Example #7
def cosine_similarity(v1, v2):
    if v1 == [] or v2 == []:
        return 0
    score = cossim(array(v1).reshape(1, -1), array(v2).reshape(1, -1))
    return score[0][0]
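A quick check of the wrapper above, assuming cossim and array come from sklearn.metrics.pairwise.cosine_similarity and numpy respectively:

from numpy import array
from sklearn.metrics.pairwise import cosine_similarity as cossim

print(cosine_similarity([1, 0, 1], [1, 1, 0]))   # ~0.5 for these vectors
print(cosine_similarity([], [1, 2, 3]))          # 0 when either vector is empty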
Example #8
    def query(self, qT):
        q_hat = np.dot(qT, self.U).dot(self.SI)
        res = cossim(q_hat, self.VT.transpose()).flatten()
        return res
Example #9
    res = lsi.query(qT)

    # don't include values for docs already read!
    res[d] = 0
    # TODO parameterize this arbitrary threshold?
    # >>> threshold also used when folding in new items <<<
    ti = np.where(res > 0.20)[0]
    tmpi.extend(ti)
    tmpu.extend(np.tile(i, ti.size))
    tmpd.extend(res[ti])

tmpm = sps.csr_matrix((tmpd, (tmpu, tmpi)), shape=users_items.shape)
users_items = users_items + tmpm

# User-user similarity matrix, enhanced
uu_enh = cossim(users_items)
uu_raw = cossim(users_items_raw)

# Item-item similarity matrix, enhanced
ii_enh = cossim(users_items.transpose())
ii_raw = cossim(users_items_raw.transpose())


def __get_user_row(user):
    uid = np.where(userids == user)[0]
    if len(uid) > 0:
        uid = uid.item()
        return users_items.getrow(uid)
    else:
        return None
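A toy version of the enhanced user-user / item-item similarity step above; the small users_items matrix is made up purely for illustration.

import numpy as np
import scipy.sparse as sps
from sklearn.metrics.pairwise import cosine_similarity as cossim

users_items = sps.csr_matrix(np.array([
    [1.0, 0.0, 0.3],   # user 0: read item 0, similar-but-unread item 2
    [0.0, 1.0, 0.0],   # user 1: read item 1
    [1.0, 0.0, 1.0],   # user 2: read items 0 and 2
]))

uu_enh = cossim(users_items)              # 3 x 3 user-user similarities
ii_enh = cossim(users_items.transpose())  # 3 x 3 item-item similarities
print(np.round(uu_enh, 2))
print(np.round(ii_enh, 2))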
Example #10
def graphAddedEdges(G, df, vectorizer, cosSim_thresh=.5):
    """
    Deals with the issue that the same topic was split into a few different clusters
    due to nuances in language (and potentially the different amount of space each
    editor gave the story's body text).
    Links groups whose mean TF-IDF vectors have a cosine similarity above cosSim_thresh.
    # nx.connected_component_subgraphs was removed in networkx 2.4; build the
    # connected-component subgraphs explicitly instead.
    connectedStories = sorted((G.subgraph(c).copy() for c in nx.connected_components(G)),
                              key=len,
                              reverse=True)
    connectedStoriesCount = [len(elm.nodes()) for elm in connectedStories]
    #finding subgraphs:
    subGraphs = range(0, connectedStoriesCount.index(2))
    dfSub = pd.DataFrame(
        columns=['filename', 'headline', 'body', 'GraphGroup'])
    for i in subGraphs:
        dfSub0 = df.loc[list(connectedStories[i].nodes()), :]
        dfSub0['GraphGroup'] = i
        dfSub = pd.concat([dfSub, dfSub0], sort=True)
    lsa = decomposition.TruncatedSVD(n_components=5,
                                     algorithm='randomized',
                                     n_iter=5)
    Xsub = vectorizer.transform(dfSub.loc[:, 'body'].values)
    XsubLsa = lsa.fit_transform(Xsub)

    #finding mean cos sim score for each subgroup:
    groupMeans = np.array(
        np.mean(Xsub[(dfSub['GraphGroup'] == subGraphs[0]).values], axis=0))
    for i in subGraphs[1:]:
        groupMeans = np.concatenate([
            groupMeans,
            np.array(
                np.mean(Xsub[(dfSub['GraphGroup'] == subGraphs[i]).values],
                        axis=0))
        ])

    #linking subgroups with rel. high cos sim score
    addedEdges = []
    combined_groups = []
    for i in subGraphs:
        simScores = cossim(groupMeans[i].reshape(1, -1), groupMeans)
        possibleMissedconnections = [
            idx for idx, above in enumerate((simScores > cosSim_thresh)[0])
            if above
        ]
        combined_groups.append(possibleMissedconnections)
        if len(possibleMissedconnections) > 1:
            for j in [elm for elm in possibleMissedconnections if elm != i]:
                u = list(connectedStories[i].nodes)[0]  # or np.random.choice(np.array(connectedStories[i].nodes))
                v = list(connectedStories[j].nodes)[0]  # or np.random.choice(np.array(connectedStories[j].nodes))
                G.add_edge(u, v)
                addedEdges.append((u, v))

    # Finding the clusters of larger groups
    metaGrouped = []
    for i in range(0, len(combined_groups)):
        grouped = []
        for elm in combined_groups[i]:
            for group in combined_groups[i:]:
                if elm in group:
                    grouped.extend(group)
        metaGrouped.append(sorted(list(set(grouped))))
    newGroups = []
    for i in range(0, len(metaGrouped)):
        if metaGrouped[i][0] == i:
            newGroups.append(metaGrouped[i])
    return dfSub, newGroups, addedEdges
Example #11
    def _n_best(self):
        """ Only consider the N most similar sentences to the centroid """
        sims = np.ravel([cossim(self.centroid, i) for i in self.submatrix])
        args = np.argsort(sims)[::-1]
        return self.submatrix[args[:self.N]]