Example #1
    def make(self,
             event,
             prefix,
             feature_set,
             min_cluster_size=2,
             sim_threshold=.2264,
             center_threshold=1.0):
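        """Write one update row per selected cluster exemplar for ``event``.

        Exemplars come from the APSalienceSummarizer clusters; per cluster
        timestamp the predicted saliences are z-scored, and an exemplar is
        kept only if its cluster has at least ``min_cluster_size`` members,
        its normalized salience reaches ``center_threshold``, and its latent
        vector stays below ``sim_threshold`` cosine similarity to every
        update already emitted."""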
        tsv_path = self.get_tsv_path(event, prefix, feature_set,
                                     center_threshold, sim_threshold)
        lvecs = SentenceLatentVectorsResource()
        spa = SaliencePredictionAggregator()
        apsal = APSalienceSummarizer()
        cluster_df = apsal.get_dataframe(event, prefix, feature_set)
        #for _, row in cluster_df.iterrows():
        #    print row['hour'], datetime.utcfromtimestamp(row['timestamp'])
        updates = []
        Xcache = None
        timestamps = sorted(list(cluster_df['timestamp'].unique()))
        for timestamp in timestamps:
            hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
            lvec_df = lvecs.get_dataframe(event, hour)
            lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
            sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
            sal_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

            clusters = cluster_df[cluster_df['timestamp'] == timestamp].copy()
            clusters.sort(['stream id', 'sentence id'], inplace=True)

            salience = []
            for _, row in clusters.iterrows():
                sal_pred = sal_df.loc[
                    (sal_df['stream id'] == row['stream id']) & \
                    (sal_df['sentence id'] == row['sentence id'])].as_matrix()[:,2:].astype(np.float64).mean()
                salience.append(sal_pred)
            clusters['salience'] = salience
            #clusters.sort(['salie'], inplace=True)

            sal_mean = np.mean(salience)
            sal_std = np.std(salience)

            #print clusters
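            # Greedy pass over this hour's cluster exemplars: skip small
            # clusters, skip exemplars whose z-scored salience falls below
            # center_threshold, and skip anything whose latent vector is too
            # similar (cosine >= sim_threshold) to an already emitted update.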
            for _, row in clusters.iterrows():
                if row['cluster size'] < min_cluster_size:
                    continue
                vec = lvec_df.loc[
                    (lvec_df['stream id'] == row['stream id']) & \
                    (lvec_df['sentence id'] == row['sentence id'])].as_matrix()[:,2:].astype(np.float64)

                sal_norm = (row['salience'] - sal_mean) / sal_std
                if sal_norm < center_threshold:
                    continue

                if Xcache is None:
                    Xcache = vec
                else:
                    if np.max(cosine_similarity(Xcache, vec)) >= sim_threshold:
                        continue
                    else:
                        Xcache = np.vstack((Xcache, vec))
                updates.append({
                    'query id': event.query_id[5:],
                    'system id': 'cunlp',
                    'run id': 'apsal-time-ranked-sal_{}-sim_{}'.format(
                        center_threshold, sim_threshold),
                    'stream id': row['stream id'],
                    'sentence id': row['sentence id'],
                    'timestamp': timestamp,
                    'conf': row['salience'],
                    'string': row['string']
                })

        print "Writing", tsv_path, "For ", center_threshold, sim_threshold
        df = pd.DataFrame(updates,
                          columns=[
                              "query id", "system id", "run id", "stream id",
                              "sentence id", "timestamp", "conf", "string"
                          ])
        with open(tsv_path, u'w') as f:
            df.to_csv(f,
                      sep='\t',
                      index=False,
                      index_label=False,
                      header=False)
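
A minimal driver sketch (the class name APSalienceFilteredSummary and the surrounding setup are assumptions for illustration; only the make signature above is taken from the example):

    # Hypothetical usage, assuming event, prefix, and feature_set are
    # configured the same way as for the other resource managers above.
    summarizer = APSalienceFilteredSummary()
    for center_threshold in (0.5, 1.0, 1.5):
        summarizer.make(event, prefix, feature_set,
                        min_cluster_size=2,
                        sim_threshold=.2264,
                        center_threshold=center_threshold)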
Example #2
    def make_summary(self, event, corpus, prefix, feature_set, cutoff):
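        """Build an hourly single-link HAC summary: for each event hour, drop
        all-zero latent vectors and boilerplate-like sentences, cluster the
        remaining vectors with euclidean single-link HAC cut at ``cutoff``,
        and record the sentence closest to each cluster centroid as that
        cluster's exemplar."""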
        string_res = get_resource_manager(u'SentenceStringsResource')
        lvec_res = get_resource_manager(u'SentenceLatentVectorsResource')
        spa = SaliencePredictionAggregator()

        tsv_path = self.get_tsv_path(event, cutoff)
        updates = []
        epoch = datetime.utcfromtimestamp(0)

        for hour in event.list_event_hours():
            hp1 = hour + timedelta(hours=1)
            timestamp = str(int((hp1 - epoch).total_seconds()))

            string_df = string_res.get_dataframe(event, hour)
            lvec_df = lvec_res.get_dataframe(event, hour)
            sal_df = spa.get_dataframe(event, hour, prefix, feature_set)

            if string_df is None or lvec_df is None or sal_df is None:
                continue

            string_df = string_df.drop_duplicates(
                subset=[u'stream id', u'sentence id'])

            lvec_df = lvec_df.drop_duplicates(
                subset=[u'stream id', u'sentence id'])

            sal_df = sal_df.drop_duplicates(
                subset=[u'stream id', u'sentence id'])

            string_df.sort([u"stream id", u"sentence id"], inplace=True)
            lvec_df.sort([u"stream id", u"sentence id"], inplace=True)
            sal_df.sort([u"stream id", u"sentence id"], inplace=True)

            X = lvec_df.as_matrix()[:, 2:].astype(np.float64)
            good_rows = np.where(X.any(axis=1))[0]
            string_df = string_df.iloc[good_rows]
            lvec_df = lvec_df.iloc[good_rows]
            sal_df = sal_df.iloc[good_rows]
            assert len(string_df) == len(lvec_df)
            assert len(string_df) == len(sal_df)

            n_sents = len(string_df)

            for i in xrange(n_sents):
                assert string_df[u'stream id'].iloc[i] == \
                    lvec_df[u'stream id'].iloc[i]
                assert string_df[u'stream id'].iloc[i] == \
                    sal_df[u'stream id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    lvec_df[u'sentence id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    sal_df[u'sentence id'].iloc[i]

            lvec_df.reset_index(drop=True, inplace=True)
            string_df.reset_index(drop=True, inplace=True)
            sal_df.reset_index(drop=True, inplace=True)
            good_rows = []

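            # Heuristic sentence filter: keep sentences longer than 9 words,
            # from documents with fewer than 200 sentences, that mention
            # social-media sites (Digg, Facebook, ...) at most once and
            # Flash/JavaScript/CSS at most once.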
            for name, doc in string_df.groupby("stream id"):
                for rname, row in doc.iterrows():
                    scstring = row["streamcorpus"]
                    words = len(re.findall(r'\b[^\W\d_]+\b', scstring))
                    socs = len(
                        re.findall(
                            r'Digg|del\.icio\.us|Facebook|Kwoff|Myspace',
                            scstring))
                    langs = len(
                        re.findall(r'Flash|JavaScript|CSS', scstring, re.I))

                    assert lvec_df.loc[rname][u'sentence id'] == \
                        row[u'sentence id']
                    assert lvec_df.loc[rname][u'stream id'] == \
                        row[u'stream id']

                    assert sal_df.loc[rname][u'sentence id'] == \
                        row[u'sentence id']
                    assert sal_df.loc[rname][u'stream id'] == \
                        row[u'stream id']

                    if words > 9 and len(doc) < 200 \
                        and socs < 2 and langs < 2:

                        good_rows.append(rname)

            lvec_df = lvec_df.loc[good_rows]
            string_df = string_df.loc[good_rows]
            sal_df = sal_df.loc[good_rows]
            n_sents = len(string_df)
            if n_sents < 10:
                continue

            for i in xrange(n_sents):
                assert string_df[u'stream id'].iloc[i] == \
                    lvec_df[u'stream id'].iloc[i]
                assert string_df[u'stream id'].iloc[i] == \
                    sal_df[u'stream id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    lvec_df[u'sentence id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    sal_df[u'sentence id'].iloc[i]

            X = lvec_df.as_matrix()[:, 2:].astype(np.float64)
            S = sal_df.as_matrix()[:, 2:].astype(np.float64)
            s = np.mean(S, axis=1)
            #Xn = Normalizer().fit_transform(X)
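            # Single-link agglomerative clustering of the latent vectors
            # (euclidean distance); flat clusters are cut at `cutoff`, and the
            # sentence nearest each cluster mean is taken as the exemplar.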
            z = fastcluster.linkage(X,
                                    method='single',
                                    metric='euclidean',
                                    preserve_input=True)
            clusters = hac.fcluster(z, cutoff, 'distance')
            II = np.arange(n_sents)
            #print set(clusters)
            for cluster_id, cluster in enumerate(set(clusters)):
                #    print cluster
                #    print (clusters == cluster).shape
                #    print II.shape
                ii = II[clusters == cluster]
                #print ii.shape
                C = X[clusters == cluster, :]

                u = np.mean(C, axis=0)
                dist_2 = np.sum((C - u)**2, axis=1)
                cidx = np.argmin(dist_2)
                #cidx = np.argmax(cosine_similarity(C, u))
                e = ii[cidx]
                #

                #Cs = s[clusters == cluster]
                #                e = ii[np.argmax(Cs)],
                cluster_size = C.shape[0]

                scstring = string_df.iloc[e][u'streamcorpus']
                stream_id = string_df.iloc[e][u'stream id']
                sentence_id = str(string_df.iloc[e][u'sentence id'])
                updates.append({
                    "stream id": stream_id,
                    "sentence id": sentence_id,
                    "hour": hour,
                    "timestamp": timestamp,
                    "cluster size": cluster_size,
                    "string": scstring
                })

        df = pd.DataFrame(updates,
                          columns=[
                              "stream id", "sentence id", "hour", "timestamp",
                              "cluster size", "string"
                          ])
        with gzip.open(tsv_path, u'w') as f:
            df.to_csv(f, sep='\t', index=False, index_label=False)
Example #3
    def make(self, event, prefix, feature_set, sal_cutoff, sim_cutoff):
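        """Emit salience-ranked updates with a greedy redundancy filter: per
        hour, z-score the mean salience predictions, keep sentences above
        ``sal_cutoff``, and accept them in descending salience order only if
        their cosine similarity to every already accepted update (this hour
        and earlier) stays below ``sim_cutoff``."""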
        tsv_path = self.get_tsv_path(event, sal_cutoff, sim_cutoff)
        lvecs = SentenceLatentVectorsResource()
        string_res = get_resource_manager(u'SentenceStringsResource')

        spa = SaliencePredictionAggregator()
        #cluster_df = hac.get_dataframe(event, dist_cutoff)
        #for _, row in cluster_df.iterrows():
        #    print row['hour'], datetime.utcfromtimestamp(row['timestamp'])
        epoch = datetime.utcfromtimestamp(0)
        updates = []
        Xcache = None
        #timestamps = sorted(list(cluster_df['timestamp'].unique()))
        hours = event.list_event_hours()
        for hour in hours:
            #hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
            hp1 = hour + timedelta(hours=1)
            timestamp = str(int((hp1 - epoch).total_seconds()))
            lvec_df = lvecs.get_dataframe(event, hour)
            sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
            str_df = string_res.get_dataframe(event, hour)
            if lvec_df is None or sal_df is None or str_df is None:
                continue
            str_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

            lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
            sal_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

            str_df.sort(['stream id', 'sentence id'], inplace=True)
            str_df.reset_index(drop=True, inplace=True)
            lvec_df.sort(['stream id', 'sentence id'], inplace=True)
            lvec_df.reset_index(drop=True, inplace=True)
            sal_df.sort(['stream id', 'sentence id'], inplace=True)
            sal_df.reset_index(drop=True, inplace=True)

            str_df = str_df.join(
                str_df.groupby('stream id')['sentence id'].agg('count'),
                on='stream id',
                rsuffix='_r').rename(
                    columns={"sentence id_r": "document count"})
            good_sents = str_df.apply(lambda x: passes_simple_filter(
                x['streamcorpus'], x['document count']),
                                      axis=1)
            #good_sents = good_sents.reset_index()

            str_df = str_df[good_sents]
            lvec_df = lvec_df[good_sents]
            sal_df = sal_df[good_sents]

            n_rows = len(sal_df)
            for i in xrange(n_rows):
                assert sal_df['stream id'].iloc[i] == \
                    lvec_df['stream id'].iloc[i]
                assert sal_df['sentence id'].iloc[i] == \
                    lvec_df['sentence id'].iloc[i]

                assert str_df['stream id'].iloc[i] == \
                    lvec_df['stream id'].iloc[i]
                assert str_df['sentence id'].iloc[i] == \
                    lvec_df['sentence id'].iloc[i]

            if n_rows == 0:
                continue

            Xsal = sal_df.as_matrix()[:, 2:].astype(np.float64).mean(axis=1)
            mu_sal = np.mean(Xsal)
            sig_sal = np.std(Xsal)
            Xsal_norm = (Xsal - mu_sal) / sig_sal

            lvec_df = lvec_df[Xsal_norm > sal_cutoff]
            str_df = str_df[Xsal_norm > sal_cutoff]
            str_df = str_df.set_index(['stream id', 'sentence id'])
            lvec_df['salience'] = Xsal_norm[Xsal_norm > sal_cutoff]
            lvec_df.sort(['salience'], inplace=True, ascending=False)
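            # Visit candidates in descending salience order. The masked
            # similarity matrix exposes, for each candidate, only its cosine
            # similarity to updates accepted so far (including earlier hours
            # via Xcache); a column is unmasked once its row is accepted.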
            if Xcache is None:
                Xlvecs = lvec_df.as_matrix()[:, 2:-2].astype(np.float64)

                K = cosine_similarity(Xlvecs)
                K_ma = np.ma.array(K, mask=True)
                good_indexes = []
                for i, (_, row) in enumerate(lvec_df.iterrows()):
                    sim = K_ma[i, :].max(fill_value=0.0)
                    if not isinstance(sim, np.float64):
                        sim = 0
                    if sim < sim_cutoff:
                        up_str = str_df.loc[row['stream id'],
                                            row['sentence id']]['streamcorpus']
                        updates.append({
                            'query id': event.query_num,
                            'system id': 'cunlp',
                            'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                                sal_cutoff, sim_cutoff),
                            'stream id': row['stream id'],
                            'sentence id': row['sentence id'],
                            'timestamp': timestamp,
                            'conf': row['salience'],
                            'string': up_str
                        })
                        K_ma.mask[:, i] = False
                        good_indexes.append(i)

                Xcache = Xlvecs[good_indexes].copy()
            else:
                xtmp = lvec_df.as_matrix()[:, 2:-2].astype(np.float64)

                Xlvecs = np.vstack([Xcache, xtmp])

                start_index = Xcache.shape[0]
                K = cosine_similarity(Xlvecs)
                K_ma = np.ma.array(K, mask=True)
                K_ma.mask.T[np.arange(0, start_index)] = False
                good_indexes = []
                for i, (_, row) in enumerate(lvec_df.iterrows(), start_index):
                    #print i
                    sim = K_ma[i, :].max(fill_value=0.0)
                    #print sim
                    if sim < sim_cutoff:
                        up_str = str_df.loc[row['stream id'],
                                            row['sentence id']]['streamcorpus']
                        updates.append({
                            'query id': event.query_num,
                            'system id': 'cunlp',
                            'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                                sal_cutoff, sim_cutoff),
                            'stream id': row['stream id'],
                            'sentence id': row['sentence id'],
                            'timestamp': timestamp,
                            'conf': row['salience'],
                            'string': up_str
                        })
                        K_ma.mask[:, i] = False
                        good_indexes.append(i)
                if len(good_indexes) > 0:
                    Xcache = np.vstack([Xcache, Xlvecs[good_indexes].copy()])
                #Xcache = Xlvecs[good_indexes].copy()

        if len(updates) == 0:
            updates.append({
                'query id': event.query_num,
                'system id': 'cunlp',
                'run id': 'sal-ranked-sal_{}-sim_{}'.format(sal_cutoff, sim_cutoff),
                'stream id': 1111,
                'sentence id': 1,
                'timestamp': timestamp,
                'conf': 0,
                'string': 'place holder'
            })

        df = pd.DataFrame(updates,
                          columns=[
                              "query id", "system id", "run id", "stream id",
                              "sentence id", "timestamp", "conf", "string"
                          ])
        with open(tsv_path, u'w') as f:
            df.to_csv(f,
                      sep='\t',
                      index=False,
                      index_label=False,
                      header=False)
Example #4
File: ap.py Project: kedz/cuttsum
    def make_summary(self, event, corpus, prefix, feature_set):
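        """Summarize each event hour with affinity propagation: filter and
        deduplicate the hour's sentences, z-score their pairwise cosine
        similarities, use the mean salience predictions (shifted onto the
        same scale) as preferences, and record each cluster's exemplar
        sentence together with its cluster size."""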
        string_res = get_resource_manager(u'SentenceStringsResource')
        lvec_res = get_resource_manager(u'SentenceLatentVectorsResource')
        spa = SaliencePredictionAggregator()

        tsv_path = self.get_tsv_path(event, prefix, feature_set)
        updates = []
        epoch = datetime.utcfromtimestamp(0)
        for hour in event.list_event_hours():
            hp1 = hour + timedelta(hours=1)
            timestamp = str(int((hp1 - epoch).total_seconds()))

            string_df = string_res.get_dataframe(event, hour)     
            lvec_df = lvec_res.get_dataframe(event, hour)   
            sal_df = spa.get_dataframe(event, hour, prefix, feature_set)   
            
            if string_df is None or lvec_df is None or sal_df is None:
                continue           

            string_df = string_df.drop_duplicates(
                subset=[u'stream id', u'sentence id'])

            lvec_df = lvec_df.drop_duplicates(
                subset=[u'stream id', u'sentence id'])

            sal_df = sal_df.drop_duplicates(
                subset=[u'stream id', u'sentence id'])

            string_df.sort([u"stream id", u"sentence id"], inplace=True)
            lvec_df.sort([u"stream id", u"sentence id"], inplace=True)
            sal_df.sort([u"stream id", u"sentence id"], inplace=True)

            X = lvec_df.as_matrix()[:,2:].astype(np.float64)    
            good_rows = np.where(X.any(axis=1))[0]
            string_df = string_df.iloc[good_rows]
            lvec_df = lvec_df.iloc[good_rows]
            sal_df = sal_df.iloc[good_rows]
            assert len(string_df) == len(lvec_df)
            assert len(string_df) == len(sal_df)
            
            n_sents = len(string_df)
                               
            for i in xrange(n_sents):
                assert string_df[u'stream id'].iloc[i] == \
                    lvec_df[u'stream id'].iloc[i]
                assert string_df[u'stream id'].iloc[i] == \
                    sal_df[u'stream id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    lvec_df[u'sentence id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    sal_df[u'sentence id'].iloc[i]

            lvec_df.reset_index(drop=True, inplace=True) 
            string_df.reset_index(drop=True, inplace=True) 
            sal_df.reset_index(drop=True, inplace=True) 

            
            

            string_df.drop_duplicates(subset=['streamcorpus'], inplace=True)
            string_df['update id'] = string_df['stream id'].map(str) + "-" + \
                string_df['sentence id'].map(str)
            good_uids = set(string_df['update id'].tolist())
            
            lvec_df['update id'] = lvec_df['stream id'].map(str) + "-" + \
                lvec_df['sentence id'].map(str)
            lvec_df = lvec_df[lvec_df['update id'].isin(good_uids)].copy()

            sal_df['update id'] = sal_df['stream id'].map(str) + "-" + \
                sal_df['sentence id'].map(str)
            sal_df = sal_df[sal_df['update id'].isin(good_uids)].copy()
            
            string_df.sort([u"stream id", u"sentence id"], inplace=True)
            lvec_df.sort([u"stream id", u"sentence id"], inplace=True)
            sal_df.sort([u"stream id", u"sentence id"], inplace=True)

            lvec_df.reset_index(drop=True, inplace=True) 
            string_df.reset_index(drop=True, inplace=True) 
            sal_df.reset_index(drop=True, inplace=True) 

            n_sents = len(string_df)
                               
            for i in xrange(n_sents):
                assert string_df[u'stream id'].iloc[i] == \
                    lvec_df[u'stream id'].iloc[i]
                assert string_df[u'stream id'].iloc[i] == \
                    sal_df[u'stream id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    lvec_df[u'sentence id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    sal_df[u'sentence id'].iloc[i]

          
            good_rows = []
            for name, doc in string_df.groupby("stream id"):
                for rname, row in doc.iterrows():
                    scstring = row["streamcorpus"]
                    words = len(re.findall(r'\b[^\W\d_]+\b', scstring))
                    socs = len(re.findall(
                        r'Digg|del\.icio\.us|Facebook|Kwoff|Myspace',
                        scstring))  
                    langs = len(re.findall(
                        r'Flash|JavaScript|CSS', scstring, re.I))

                    assert lvec_df.loc[rname][u'sentence id'] == \
                        row[u'sentence id']
                    assert lvec_df.loc[rname][u'stream id'] == \
                        row[u'stream id']

                    assert sal_df.loc[rname][u'sentence id'] == \
                        row[u'sentence id']
                    assert sal_df.loc[rname][u'stream id'] == \
                        row[u'stream id']

                    if words > 9 and len(doc) < 200 \
                        and socs < 2 and langs < 2:
                        
                        good_rows.append(rname)
            
            lvec_df = lvec_df.loc[good_rows]
            string_df = string_df.loc[good_rows]
            sal_df = sal_df.loc[good_rows]
            n_sents = len(string_df)
            if n_sents < 10:
                continue

            for i in xrange(n_sents):
                assert string_df[u'stream id'].iloc[i] == \
                    lvec_df[u'stream id'].iloc[i]
                assert string_df[u'stream id'].iloc[i] == \
                    sal_df[u'stream id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    lvec_df[u'sentence id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    sal_df[u'sentence id'].iloc[i]

            del lvec_df['update id'] 
            del sal_df['update id'] 
            X = lvec_df.as_matrix()[:,2:].astype(np.float64) 
            S = sal_df.as_matrix()[:,2:].astype(np.float64)
            s = np.mean(S, axis=1)
            A = cosine_similarity(X)
            Aupper = A[np.triu_indices_from(A, k=1)]
            Amu = np.mean(Aupper)
            Astd = np.std(Aupper)

            A = (A - Amu) / Astd
            

            max_sim = np.max(
                (np.max(A[np.triu_indices_from(A, k=1)]), np.max(s)))
            A = A - max_sim
            P = s - max_sim
            
            
            #P = MinMaxScaler(feature_range=(-9, -5)).fit_transform(s)
            #A = MinMaxScaler(feature_range=(-3, -1)).fit_transform(A)
            
            #std_s = StandardScaler().fit_transform(s)
            #assert X.shape[0] == s.shape[0]

            #period = (((hour + timedelta(hours=6)) - \
            #    event.start).total_seconds() // (6 * 3600))
            #cutoff = 2. * period / (1. + period)
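            # Cluster with affinity propagation on the z-scored cosine
            # similarities; preferences are the mean salience predictions, and
            # both are shifted by the same constant (the larger of the max
            # off-diagonal similarity and the max salience) before fitting.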
            af = AffinityPropagation(
                preference=P, affinity='precomputed', max_iter=500,
                damping=.7, verbose=False).fit(A)

            if af.cluster_centers_indices_ is None:
                continue
            II = np.arange(n_sents)
            for cnum, cluster in enumerate(np.unique(af.labels_)):
                e = af.cluster_centers_indices_[cluster]
                cluster_size = II[cluster == af.labels_].shape[0]

                scstring = string_df.iloc[e][u'streamcorpus']
                stream_id = string_df.iloc[e][u'stream id']
                sentence_id = str(string_df.iloc[e][u'sentence id'])
                updates.append({"stream id": stream_id,
                                "sentence id": sentence_id,
                                "hour": hour, 
                                "timestamp": timestamp,
                                "cluster size": cluster_size,
                                "string": scstring})
        df = pd.DataFrame(updates, 
            columns=["stream id", "sentence id", "hour", "timestamp",
                     "cluster size", "string"])
        with gzip.open(tsv_path, u'w') as f:
            df.to_csv(f, sep='\t', index=False, index_label=False)
Example #5
    def make_summary(self, event, corpus, prefix, feature_set, cutoff):
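        """Hourly HAC summarizer: filter the hour's sentences, single-link
        cluster their latent vectors at the given euclidean distance
        ``cutoff``, and emit the centroid-nearest sentence of each cluster
        along with its cluster size."""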
        string_res = get_resource_manager(u'SentenceStringsResource')
        lvec_res = get_resource_manager(u'SentenceLatentVectorsResource')
        spa = SaliencePredictionAggregator()

        tsv_path = self.get_tsv_path(event, cutoff)
        updates = []
        epoch = datetime.utcfromtimestamp(0)

        for hour in event.list_event_hours():
            hp1 = hour + timedelta(hours=1)
            timestamp = str(int((hp1 - epoch).total_seconds()))

            string_df = string_res.get_dataframe(event, hour)
            lvec_df = lvec_res.get_dataframe(event, hour)      
            sal_df = spa.get_dataframe(event, hour, prefix, feature_set)

            if string_df is None or lvec_df is None or sal_df is None:
                continue           

            string_df = string_df.drop_duplicates(
                subset=[u'stream id', u'sentence id'])

            lvec_df = lvec_df.drop_duplicates(
                subset=[u'stream id', u'sentence id'])

            sal_df = sal_df.drop_duplicates(
                subset=[u'stream id', u'sentence id'])

            string_df.sort([u"stream id", u"sentence id"], inplace=True)
            lvec_df.sort([u"stream id", u"sentence id"], inplace=True)
            sal_df.sort([u"stream id", u"sentence id"], inplace=True)

            X = lvec_df.as_matrix()[:,2:].astype(np.float64)
            good_rows = np.where(X.any(axis=1))[0]
            string_df = string_df.iloc[good_rows]
            lvec_df = lvec_df.iloc[good_rows]
            sal_df = sal_df.iloc[good_rows]
            assert len(string_df) == len(lvec_df)
            assert len(string_df) == len(sal_df)

            n_sents = len(string_df)

            for i in xrange(n_sents):
                assert string_df[u'stream id'].iloc[i] == \
                    lvec_df[u'stream id'].iloc[i]
                assert string_df[u'stream id'].iloc[i] == \
                    sal_df[u'stream id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    lvec_df[u'sentence id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    sal_df[u'sentence id'].iloc[i]

            lvec_df.reset_index(drop=True, inplace=True)
            string_df.reset_index(drop=True, inplace=True)
            sal_df.reset_index(drop=True, inplace=True)
            good_rows = []

            for name, doc in string_df.groupby("stream id"):
                for rname, row in doc.iterrows():
                    scstring = row["streamcorpus"]
                    words = len(re.findall(r'\b[^\W\d_]+\b', scstring))
                    socs = len(re.findall(
                        r'Digg|del\.icio\.us|Facebook|Kwoff|Myspace',
                        scstring))
                    langs = len(re.findall(
                        r'Flash|JavaScript|CSS', scstring, re.I))

                    assert lvec_df.loc[rname][u'sentence id'] == \
                        row[u'sentence id']
                    assert lvec_df.loc[rname][u'stream id'] == \
                        row[u'stream id']

                    assert sal_df.loc[rname][u'sentence id'] == \
                        row[u'sentence id']
                    assert sal_df.loc[rname][u'stream id'] == \
                        row[u'stream id']

                    if words > 9 and len(doc) < 200 \
                        and socs < 2 and langs < 2:

                        good_rows.append(rname)

            lvec_df = lvec_df.loc[good_rows]
            string_df = string_df.loc[good_rows]
            sal_df = sal_df.loc[good_rows]
            n_sents = len(string_df)
            if n_sents < 10:
                continue

            for i in xrange(n_sents):
                assert string_df[u'stream id'].iloc[i] == \
                    lvec_df[u'stream id'].iloc[i]
                assert string_df[u'stream id'].iloc[i] == \
                    sal_df[u'stream id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    lvec_df[u'sentence id'].iloc[i]
                assert string_df[u'sentence id'].iloc[i] == \
                    sal_df[u'sentence id'].iloc[i]

            X = lvec_df.as_matrix()[:,2:].astype(np.float64)
            S = sal_df.as_matrix()[:,2:].astype(np.float64)
            s = np.mean(S, axis=1)
            #Xn = Normalizer().fit_transform(X)
            z = fastcluster.linkage(X, 
                method='single', metric='euclidean', preserve_input=True)      
            clusters = hac.fcluster(z, cutoff, 'distance') 
            II = np.arange(n_sents)
            #print set(clusters)
            for cluster_id, cluster in enumerate(set(clusters)):
            #    print cluster
            #    print (clusters == cluster).shape
            #    print II.shape
                ii = II[clusters == cluster]
                #print ii.shape
                C = X[clusters == cluster,:]

                u = np.mean(C, axis=0)
                dist_2 = np.sum((C - u)**2, axis=1)
                cidx = np.argmin(dist_2)
                #cidx = np.argmax(cosine_similarity(C, u))
                e = ii[cidx]
                #
                #Cs = s[clusters == cluster]
                #                e = ii[np.argmax(Cs)],
                cluster_size = C.shape[0]

                scstring = string_df.iloc[e][u'streamcorpus']
                stream_id = string_df.iloc[e][u'stream id']
                sentence_id = str(string_df.iloc[e][u'sentence id'])
                updates.append({"stream id": stream_id,
                                "sentence id": sentence_id,
                                "hour": hour,
                                "timestamp": timestamp,
                                "cluster size": cluster_size,
                                "string": scstring})

        df = pd.DataFrame(updates,
            columns=["stream id", "sentence id", "hour", "timestamp",
                     "cluster size", "string"])
        with gzip.open(tsv_path, u'w') as f:
            df.to_csv(f, sep='\t', index=False, index_label=False)        
Example #6
    def make(self, event, prefix, feature_set, 
             min_cluster_size=2, sim_threshold=.2264, center_threshold=1.0):
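        """Filter APSalience cluster exemplars: per cluster timestamp, keep
        exemplars whose cluster meets ``min_cluster_size``, whose z-scored
        salience reaches ``center_threshold``, and whose latent vector is not
        within ``sim_threshold`` cosine similarity of an earlier pick, then
        write them out as update rows."""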
        tsv_path = self.get_tsv_path(event, prefix, feature_set,
            center_threshold, sim_threshold)
        lvecs = SentenceLatentVectorsResource()
        spa = SaliencePredictionAggregator()
        apsal = APSalienceSummarizer()
        cluster_df = apsal.get_dataframe(event, prefix, feature_set)
        #for _, row in cluster_df.iterrows():
        #    print row['hour'], datetime.utcfromtimestamp(row['timestamp'])
        updates = []
        Xcache = None
        timestamps = sorted(list(cluster_df['timestamp'].unique()))
        for timestamp in timestamps:
            hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
            lvec_df = lvecs.get_dataframe(event, hour)
            lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
            sal_df = spa.get_dataframe(event, hour, prefix, feature_set)            
            sal_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

            clusters = cluster_df[cluster_df['timestamp'] == timestamp].copy()
            clusters.sort(['stream id', 'sentence id'], inplace=True)

            salience = []
            for _, row in clusters.iterrows():
                sal_pred = sal_df.loc[
                    (sal_df['stream id'] == row['stream id']) & \
                    (sal_df['sentence id'] == row['sentence id'])].as_matrix()[:,2:].astype(np.float64).mean()
                salience.append(sal_pred)
            clusters['salience'] = salience
            #clusters.sort(['salie'], inplace=True)

            sal_mean = np.mean(salience)
            sal_std = np.std(salience)


            #print clusters
            for _, row in clusters.iterrows():
                if row['cluster size'] < min_cluster_size:
                    continue   
                vec = lvec_df.loc[
                    (lvec_df['stream id'] == row['stream id']) & \
                    (lvec_df['sentence id'] == row['sentence id'])].as_matrix()[:,2:].astype(np.float64)

                sal_norm = (row['salience'] - sal_mean) / sal_std
                if sal_norm < center_threshold:
                    continue  

                if Xcache is None:
                    Xcache = vec
                else:
                    if np.max(cosine_similarity(Xcache, vec)) >= sim_threshold:
                        continue
                    else:
                        Xcache = np.vstack((Xcache, vec))
                updates.append({
                    'query id': event.query_id[5:],
                    'system id': 'cunlp',
                    'run id': 'apsal-time-ranked-sal_{}-sim_{}'.format(
                        center_threshold, sim_threshold),
                    'stream id': row['stream id'], 
                    'sentence id': row['sentence id'],
                    'timestamp': timestamp,
                    'conf': row['salience'],
                    'string': row['string']
                })

        print "Writing", tsv_path, "For ", center_threshold, sim_threshold
        df = pd.DataFrame(updates,
            columns=["query id", "system id", "run id",
                     "stream id", "sentence id", "timestamp", 
                     "conf", "string"])
        with open(tsv_path, u'w') as f:
            df.to_csv(
                f, sep='\t', index=False, index_label=False, header=False)
Example #7
    def make(self, event, prefix, feature_set, sal_cutoff, sim_cutoff):
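        """Greedy salience-ranked update selection with a running cosine
        redundancy check against all previously emitted updates (Xcache)."""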
        tsv_path = self.get_tsv_path(event, sal_cutoff, sim_cutoff)
        lvecs = SentenceLatentVectorsResource()
        string_res = get_resource_manager(u'SentenceStringsResource')

        spa = SaliencePredictionAggregator()
        #cluster_df = hac.get_dataframe(event, dist_cutoff)
        #for _, row in cluster_df.iterrows():
        #    print row['hour'], datetime.utcfromtimestamp(row['timestamp'])
        epoch = datetime.utcfromtimestamp(0)
        updates = []
        Xcache = None
        #timestamps = sorted(list(cluster_df['timestamp'].unique()))
        hours = event.list_event_hours()
        for hour in hours:
            #hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
            hp1 = hour + timedelta(hours=1)
            timestamp = str(int((hp1 - epoch).total_seconds()))
            lvec_df = lvecs.get_dataframe(event, hour)
            sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
            str_df = string_res.get_dataframe(event, hour)
            if lvec_df is None or sal_df is None or str_df is None:
                continue
            str_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
                
            lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
            sal_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
            
            
            str_df.sort(['stream id', 'sentence id'], inplace=True)
            str_df.reset_index(drop=True, inplace=True)
            lvec_df.sort(['stream id', 'sentence id'], inplace=True)
            lvec_df.reset_index(drop=True, inplace=True)
            sal_df.sort(['stream id', 'sentence id'], inplace=True)
            sal_df.reset_index(drop=True, inplace=True)

            str_df = str_df.join(
                str_df.groupby('stream id')['sentence id'].agg('count'), 
                on='stream id', rsuffix='_r').rename(
                    columns={"sentence id_r": "document count"})
            good_sents = str_df.apply(
                lambda x: passes_simple_filter(
                    x['streamcorpus'], x['document count']), axis=1)
            #good_sents = good_sents.reset_index()

            str_df = str_df[good_sents] 
            lvec_df = lvec_df[good_sents] 
            sal_df = sal_df[good_sents]
             
            n_rows = len(sal_df)
            for i in xrange(n_rows):
                assert sal_df['stream id'].iloc[i] == \
                    lvec_df['stream id'].iloc[i]
                assert sal_df['sentence id'].iloc[i] == \
                    lvec_df['sentence id'].iloc[i]
                
                assert str_df['stream id'].iloc[i] == \
                    lvec_df['stream id'].iloc[i]
                assert str_df['sentence id'].iloc[i] == \
                    lvec_df['sentence id'].iloc[i]
             
            if n_rows == 0:
                continue

            Xsal = sal_df.as_matrix()[:,2:].astype(np.float64).mean(axis=1)
            mu_sal = np.mean(Xsal)
            sig_sal = np.std(Xsal)
            Xsal_norm = (Xsal - mu_sal) / sig_sal
            
            lvec_df = lvec_df[Xsal_norm > sal_cutoff]
            str_df = str_df[Xsal_norm > sal_cutoff]
            str_df = str_df.set_index(['stream id', 'sentence id'])
            lvec_df['salience'] = Xsal_norm[Xsal_norm > sal_cutoff]
            lvec_df.sort(['salience'], inplace=True, ascending=False)
            if Xcache is None:
                Xlvecs = lvec_df.as_matrix()[:, 2:-2].astype(np.float64)

                K = cosine_similarity(Xlvecs)
                K_ma = np.ma.array(K, mask=True)
                good_indexes = []
                for i, (_, row) in enumerate(lvec_df.iterrows()):
                    sim = K_ma[i,:].max(fill_value=0.0)    
                    if not isinstance(sim, np.float64):
                        sim = 0
                    if sim < sim_cutoff:
                        up_str = str_df.loc[
                            row['stream id'],
                            row['sentence id']]['streamcorpus']
                        updates.append({
                            'query id': event.query_num,
                            'system id': 'cunlp',
                            'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                                sal_cutoff, sim_cutoff),
                            'stream id': row['stream id'], 
                            'sentence id': row['sentence id'],
                            'timestamp': timestamp,
                            'conf': row['salience'],
                            'string': up_str}) 
                        K_ma.mask[:,i] = False
                        good_indexes.append(i)
                    
                Xcache = Xlvecs[good_indexes].copy()
            else:
                xtmp = lvec_df.as_matrix()[:, 2:-2].astype(np.float64)

                Xlvecs = np.vstack([Xcache, xtmp])
                
                start_index = Xcache.shape[0]
                K = cosine_similarity(Xlvecs)
                K_ma = np.ma.array(K, mask=True)
                K_ma.mask.T[np.arange(0, start_index)] = False
                good_indexes = []
                for i, (_, row) in enumerate(lvec_df.iterrows(), start_index):
                    #print i
                    sim = K_ma[i,:].max(fill_value=0.0)    
                    #print sim
                    if sim < sim_cutoff:
                        up_str = str_df.loc[
                            row['stream id'],
                            row['sentence id']]['streamcorpus']
                        updates.append({
                            'query id': event.query_num,
                            'system id': 'cunlp',
                            'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                                sal_cutoff, sim_cutoff),
                            'stream id': row['stream id'], 
                            'sentence id': row['sentence id'],
                            'timestamp': timestamp,
                            'conf': row['salience'],
                            'string': up_str}) 
                        K_ma.mask[:,i] = False
                        good_indexes.append(i)
                if len(good_indexes) > 0:
                    Xcache = np.vstack([Xcache, Xlvecs[good_indexes].copy()])
                #Xcache = Xlvecs[good_indexes].copy()

        if len(updates) == 0:
            updates.append({
                'query id': event.query_num,
                'system id': 'cunlp',
                'run id': 'sal-ranked-sal_{}-sim_{}'.format(
                    sal_cutoff, sim_cutoff),
                'stream id': 1111, 
                'sentence id': 1,
                'timestamp': timestamp,
                'conf': 0,
                'string': 'place holder'}) 

        df = pd.DataFrame(updates,
            columns=["query id", "system id", "run id",
                     "stream id", "sentence id", "timestamp", 
                     "conf", "string"])
        with open(tsv_path, u'w') as f:
            df.to_csv(
                f, sep='\t', index=False, index_label=False, header=False)