def _get_h1_tags_to_merge(related_pairs):
        l = copy.deepcopy(related_pairs)

        def to_graph(l):
            G = networkx.Graph()
            for part in l:
                # each sublist is a bunch of nodes
                G.add_nodes_from(part)
                # it also implies a number of edges:
                G.add_edges_from(to_edges(part))
            return G

        def to_edges(l):
            """
                treat `l` as a Graph and returns it's edges
                to_edges(['a','b','c','d']) -> [(a,b), (b,c),(c,d)]
            """
            it = iter(l)
            last = next(it)

            for current in it:
                yield last, current
                last = current

        G = to_graph(l)
        return [list(l) for l in connected_components(G)]
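
A minimal usage sketch (tag names hypothetical): any two sublists that share a tag collapse into one merged group, because shared tags become shared graph nodes. It assumes the module imports `copy`, `networkx`, and `connected_components`, as the function above requires.

# usage sketch with hypothetical tag names
related_pairs = [['h1-a', 'h1-b'], ['h1-b', 'h1-c'], ['h1-d', 'h1-e']]
print(_get_h1_tags_to_merge(related_pairs))
# -> [['h1-a', 'h1-b', 'h1-c'], ['h1-d', 'h1-e']]  (order within groups may vary)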
def get_connections(coordinates, distance=2, tree_type='KDTree'):
    """Get spots which are conditionally connected

    Parameters
    ----------
    coordinates : array of tuples
    distance : float
    tree_type : str

    Returns
    -------
    list of sets or (list, list)
        Sorted connected groups of spot indices (KDTree), or the connected
        spot index pairs plus the coordinate line segments joining them
        (cKDTree fallback).
    """
    if tree_type == 'KDTree':
        nn_points = kdtree_clustering(coordinates=coordinates,
                                      distance=distance)
        graph_tree = to_graph(nn_points)
        return sorted(connected_components(graph_tree), key=len, reverse=True)
    else:
        # use cKDTree to determine connected cyto+ spots within `distance`
        connected_spots = get_connectedspots(coordinates=np.array(coordinates),
                                             distance=distance)
        lines = [[np.array(coordinates)[i],
                  np.array(coordinates)[j]] for i, j in connected_spots]
        return connected_spots, lines
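
The helpers `kdtree_clustering` and `get_connectedspots` are not shown here; a minimal sketch of the same idea using only `scipy.spatial.cKDTree`, whose `query_pairs(r)` returns every index pair closer than `r`:

import numpy as np
import networkx as nx
from scipy.spatial import cKDTree

def connected_spot_groups(coordinates, distance=2.0):
    """Sketch: group spots whose pairwise distance is <= `distance`."""
    points = np.asarray(coordinates)
    pairs = cKDTree(points).query_pairs(r=distance)  # set of (i, j) index pairs
    graph = nx.Graph()
    graph.add_nodes_from(range(len(points)))  # keep isolated spots as singletons
    graph.add_edges_from(pairs)
    return sorted(nx.connected_components(graph), key=len, reverse=True)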
Example #3
    def getCIGroups(local_data, ds_context=None, scope=None, alpha=0.001, families=None):
        """
        :param local_data: np array
        :param scope: a list of index to output variables
        :param alpha: threshold
        :param families: obsolete
        :return: np array of clustering

        This function takes an (output, conditional) tuple as input and returns independent groups
        alpha is the cutoff parameter for connected components
        BE CAREFUL WITH SPARSE DATA!
        """

        # data = preproc(local_data, ds_context, None, ohe)

        y, x = get_YX(local_data, ds_context.feature_size)

        # epsilon is assumed to be a small module-level constant that keeps
        # exact-zero p-values from dropping the corresponding graph edge
        pvals = testRcoT(y, x) + epsilon

        pvals[pvals > alpha] = 0

        clusters = np.zeros(y.shape[1])
        for i, c in enumerate(connected_components(from_numpy_matrix(pvals))):
            clusters[list(c)] = i + 1

        return split_conditional_data_by_clusters(y,
                                                  x,
                                                  clusters,
                                                  scope,
                                                  rows=False)
Example #4
    def getCIGroups(local_data, ds_context=None, scope=None, alpha=0.001, families=None):
        """
        :param local_data: np array
        :param scope: a list of index to output variables
        :param alpha: threshold
        :param families: obsolete
        :return: np array of clustering

        This function takes an (output, conditional) tuple as input and returns independent groups
        alpha is the cutoff parameter for connected components
        BE CAREFUL WITH SPARSE DATA!
        """

        data = preproc(local_data, ds_context, None, ohe)

        num_instance = data.shape[0]

        output_mask = np.zeros(data.shape, dtype=bool)  # todo check scope and node.scope again
        output_mask[:, np.arange(len(scope))] = True

        dataOut = data[output_mask].reshape(num_instance, -1)
        dataIn = data[~output_mask].reshape(num_instance, -1)

        assert len(dataIn) > 0
        assert len(dataOut) > 0

        pvals = testRcoT(dataOut, dataIn)

        pvals[pvals > alpha] = 0

        clusters = np.zeros(dataOut.shape[1])
        for i, c in enumerate(connected_components(from_numpy_matrix(pvals))):
            clusters[list(c)] = i + 1

        return split_conditional_data_by_clusters(local_data, clusters, scope, rows=False)
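
The pattern shared by Examples #3 and #4 — threshold a pairwise p-value matrix, then read off independent groups as connected components — can be isolated into a small self-contained sketch (`from_numpy_matrix` was removed in networkx 3.0; `from_numpy_array` is the current name):

import numpy as np
import networkx as nx

def independence_clusters(pvals, alpha=0.001):
    """Sketch: two variables share a cluster label iff they are connected
    through pairwise p-values <= alpha (i.e. pairs that tested dependent)."""
    adjacency = np.array(pvals, dtype=float)
    adjacency[adjacency > alpha] = 0  # drop edges between independent pairs
    # the `+ epsilon` in Example #3 guards against p-values that are exactly
    # zero, which would otherwise be treated as missing edges here
    graph = nx.from_numpy_array(adjacency)
    clusters = np.zeros(adjacency.shape[0])
    for i, component in enumerate(nx.connected_components(graph)):
        clusters[list(component)] = i + 1
    return clusters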
Example #5
def find_mentions(entities):
    """
    Find unique entities and their mentions
    Args:
        entities: (dict) a struct for each entity
    Returns: (dict) unique entities keyed by grounded ID; entities with ID -1
        are treated as separate 'UNK:<n>' entities
    """
    equivalents = []
    for e in entities:
        if e.kb_id not in equivalents:
            equivalents.append(e.kb_id)

    # mention-level data sets
    g = to_graph(equivalents)
    cc = connected_components(g)

    unique_entities = OrderedDict()
    unk_id = 0
    for c in cc:
        if tuple(c)[0] == '-1':
            continue
        unique_entities[tuple(c)] = []

    # consider non-grounded entities as separate entities
    for e in entities:
        if e.kb_id[0] == '-1':
            unique_entities[tuple(('UNK:' + str(unk_id),))] = [e]
            unk_id += 1
        else:
            for ue in unique_entities.keys():
                if list(set(e.kb_id).intersection(set(ue))):
                    unique_entities[ue] += [e]

    return unique_entities
Example #6
def idx_cleanboxes(boxes, scores, second_cutoff=0.83):
    '''
    boxes: 2d npy array containing all boxes in the image
    scores: 1d npy array containing the score for each box in boxes
    '''
    df = pd.DataFrame(boxes)
    df.columns = ['x0', 'y0', 'x1', 'y1']
    df['score'] = scores

    clusters = []
    for box1 in df.itertuples():
        cluster = {box1.Index}
        df2 = df.iloc[box1.Index + 1:, :]
        for box2 in df2.itertuples():
            if _area(box1, box2):
                cluster.add(box2.Index)
        clusters.append(cluster)
    G = _to_graph(clusters)

    final_index = []
    for c in connected_components(G):
        c = list(c)
        scores = df.loc[c, 'score']
        cdas = scores[scores >= second_cutoff]
        cdas_idx = cdas.index.to_list()
        if len(cdas_idx) > 0:
            final_index.extend(cdas_idx)
        else:
            final_index.append(scores.idxmax())
    return final_index
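
`_area` and `_to_graph` are assumed helpers here; one plausible reading, sketched below, is that `_area` tests whether two `(x0, y0, x1, y1)` boxes overlap at all and `_to_graph` is the usual sublists-to-graph conversion:

import networkx

def _area(box1, box2):
    """Sketch: True if two (x0, y0, x1, y1) boxes have positive overlap."""
    dx = min(box1.x1, box2.x1) - max(box1.x0, box2.x0)
    dy = min(box1.y1, box2.y1) - max(box1.y0, box2.y0)
    return dx > 0 and dy > 0

def _to_graph(clusters):
    """Sketch: one node per box index, with each cluster chained together."""
    G = networkx.Graph()
    for cluster in clusters:
        nodes = list(cluster)
        G.add_nodes_from(nodes)
        G.add_edges_from(zip(nodes, nodes[1:]))  # a chain implies connectivity
    return G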
Example #7
def getIndependentRDCGroups_py(data_slice,
                               threshold,
                               k=None,
                               s=1. / 6.,
                               non_linearity=numpy.sin,
                               n_jobs=1,
                               rand_gen=None):

    rdc_adjacency_matrix = rdc_test(data_slice,
                                    k=k,
                                    s=s,
                                    non_linearity=non_linearity,
                                    n_jobs=n_jobs,
                                    rand_gen=rand_gen)

    n_features = len(data_slice.cols)

    #
    # thresholding
    rdc_adjacency_matrix[rdc_adjacency_matrix < threshold] = 0
    #print("thresholding", rdc_adjacency_matrix)

    #
    # getting connected components
    result = numpy.zeros(n_features)
    for i, c in enumerate(connected_components(from_numpy_matrix(rdc_adjacency_matrix))):
        result[list(c)] = i + 1

    return result
Example #8
def compute_overlapping_pairs(candidate_matching_pairs):
    # A connected-components problem: nodes are specific article files.
    G = nx.Graph()
    G.add_edges_from(candidate_matching_pairs)
    overlapping_matching_articles = connected_components(G)
    #print list(overlapping_matching_articles)
    return list(overlapping_matching_articles)
def pairwise_connectivity(G):
    components = connected.connected_components(G)
    result = 0

    for component in components:
        n = len(component)
        result += (n * (n - 1)) // 2

    return result
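
Each component of size n contributes n(n-1)/2 connected pairs, so a graph with components of sizes 3 and 2 yields 3 + 1 = 4. A quick check, assuming the function above and its `connected` module import are in scope:

import networkx as nx

G = nx.Graph([(1, 2), (2, 3), (4, 5)])  # components {1, 2, 3} and {4, 5}
assert pairwise_connectivity(G) == 4    # 3*2//2 + 2*1//2 = 3 + 1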
def assign_clusters():
    '''
    This function does the actual donor assignment, assigning unique donor IDs in the contribution table
    based on clusters of contributions that appear to have the same donor. It works pretty much the same
    as the mark_matches function above.
    '''
    # Again, instantiate and train our classifier
    clf = RandomForestClassifier(n_estimators=10, random_state=0)
    clf = clf.fit([eval(t.features) for t in TRAINING_DATA],
                  [int(t.same) for t in TRAINING_DATA])

    # Loop through the last name groups
    print('Processing groups ...')
    for g in Contribution.objects.all().values('group_id').distinct():
        if not g['group_id']: continue
        toupdate = []
        G = nx.Graph()  # Create an empty network graph for each last name group
        # We're using a simple hash function to help generate unique donor IDs
        nameid = hashlib.sha224(str(g['group_id']).encode('utf-8')).hexdigest()  # hashlib needs bytes
        # For each match in a last name group
        for m in Match.objects.filter(c1__group_id=g['group_id']):
            # Do the two contributions have the same donor? Same as above.
            edge = clf.predict_proba(eval(m.features))
            if edge[0][1] > edge[0][0]:
                # If they do, add an edge between those contributions in the network graph we created
                # a few steps ago. This process is outlined in the steps here:
                # https://github.com/cjdd3b/fec-standardizer/wiki/Matching-donors
                G.add_edge(m.c1, m.c2)

        # Now we want to go through the graph we created and basically find all the contributions that are
        # connected. If the contributions were connected in the step above, that means they're probably from
        # the same donor. So a donor is basically defined by small networks of connected contributions. This
        # is described further here: https://github.com/cjdd3b/fec-standardizer/wiki/Defining-donor-clusters
        ccs = connected_components(G)

        # Now loop through each of the donor clusters generated by the connected_components function
        for donor_id, cluster in enumerate(ccs):
            for i in cluster:
                # Create a donor ID based on our group hash above and the enumerated cluster number
                classifier_id = '%s%s' % (donor_id, nameid)
                i.classifier_id = classifier_id[:12]
                toupdate.append(i)
        # Bulk save the donor IDs to the contribution table
        commit_saves(toupdate)

    print('Cleaning up the leftovers ...')
    tocleanup = []
    for record in Contribution.objects.filter(classifier_id__isnull=True):
        if not record.match_repr: continue
        classifier_id = '99%s' % hashlib.sha224(record.match_repr.encode('utf-8')).hexdigest()
        record.classifier_id = classifier_id[:12]
        tocleanup.append(record)
    commit_saves(tocleanup)
    return
Example #11
def to_graph(l):
    ''' Uses the networkx connected components algorithm
    '''
    G = networkx.Graph()
    for part in l:
        # each sublist is a bunch of nodes
        G.add_nodes_from(part)
        # it also implies a number of edges:
        G.add_edges_from(to_edges(part))
    return list(connected_components(G))
Example #12
def get_unidirectional_scheme(df, threshold=5):
    new_arr = copy.copy(df.values)
    new_arr = np.where(new_arr > threshold, new_arr, 0)
    class_names = list(df.index)
    graph = nx.from_numpy_matrix(new_arr, create_using=nx.Graph)
    unidirectional_chains = []
    for elem in connected_components(graph):
        if len(elem) != 1:
            unidirectional_chains.append(list(map(lambda x: class_names[x], elem)))
    return get_reorg_dict(unidirectional_chains)
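
`get_reorg_dict` is not shown; a plausible sketch consistent with its use maps every class name in a merged chain onto a single representative label:

def get_reorg_dict(chains):
    """Sketch: map each class name in a chain to one representative label."""
    reorg = {}
    for chain in chains:
        representative = chain[0]  # arbitrary choice of representative
        for name in chain:
            reorg[name] = representative
    return reorg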
Example #13
 def one_shot_agglomeration(self, threshold=0.5):
     g = self.copy()
     if len(g.merge_queue) == 0:
         g.rebuild_merge_queue()
     for u, v, d in g.edges(data=True):
         if g.boundary_body in [u,v] or d['weight'] > threshold:
             g.remove_edge(u, v)
     ccs = connected_components(g)
     for cc in ccs:
         g.merge_subgraph(cc)
     return g.get_segmentation()
    def get_result(self):
        list_connections = [
            set(re.findall(r"[\w']+", line.strip()))
            for line in self.input_content.split('\n')
        ]
        graph = self.to_graph(list_connections)

        list_groups = list(connected_components(graph))
        if not self.get_groups:
            list_groups = list(subset for subset in list_groups
                               if '0' in subset)[0]
        return list_groups
Example #17
def compute_overlapping_pairs(candidate_pairs, published, sources):
    print("Overlapping pairs")
    G = nx.Graph()
    G.add_weighted_edges_from(candidate_pairs)
    nx.set_node_attributes(G, published, name="published")
    cc = connected_components(G)

    selected_pairs = list()
    for c in cc:
        selected_pairs.extend(select_most_correct_pairs(c, G, sources))

    return selected_pairs
Example #18
def cluster(dupes, threshold=.5):
    '''
    Takes in a list of duplicate pairs and clusters them into a
    list of records that all refer to the same entity, based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision; raising it will increase
                 recall
    '''

    threshold = 1 - threshold

    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]

    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from((x[0], x[1], y) for (x, y) in dupes)

    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 2:
            pair_gen = ((x[0:2], x[2]['weight'])
                        for x in dupe_graph.edges_iter(sub_graph, data=True))

            pairs = numpy.fromiter(pair_gen, dtype=score_dtype)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage,
                                          threshold,
                                          criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id,
                                      []).append(i_to_id[i])

            cluster_id += max(partition)
        else:

            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) > 1]

    return clusters
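
`condensedDistance` is assumed by this and the later clustering examples (note that `edges_iter` is the networkx 1.x spelling; it became `edges` in 2.0). A sketch consistent with how it is used, converting scored record pairs into the condensed distance vector that `fastcluster.linkage` expects:

import numpy

def condensedDistance(dupes):
    """Sketch: scored pairs -> (index-to-id map, condensed distances).

    Distances are 1 - score, laid out in the condensed (upper-triangle)
    order that scipy/fastcluster expect; unscored pairs default to 1.0.
    """
    ids = sorted({record_id for pair in dupes['pairs'] for record_id in pair})
    id_to_i = {record_id: i for i, record_id in enumerate(ids)}
    n = len(ids)
    condensed = numpy.ones(n * (n - 1) // 2)
    for pair, score in zip(dupes['pairs'], dupes['score']):
        i, j = sorted((id_to_i[pair[0]], id_to_i[pair[1]]))
        condensed[n * i - i * (i + 1) // 2 + (j - i - 1)] = 1.0 - float(score)
    return dict(enumerate(ids)), condensed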
Example #19
def get_connected_components(frames):
    """ Take the frames at each level interval and calculate connected 
    components."""
    f_maxes = frames.max(axis=(1,2))
    # Relabel frames so that object numbers are unique across frames
    for i in range(1, frames.shape[0]):
        frames[i, frames[i] > 0] += np.cumsum(f_maxes[:-1])[i-1]

    # Create graph for cells that overlap at different vertical levels.
    overlap_graph = networkx.Graph()
    total_objs = frames[-1].max()
    # include every object label from 1 through total_objs
    overlap_graph.add_nodes_from(range(1, total_objs + 1))

    # Create edges between the objects that overlap vertically.
    for i in range(frames.shape[0]-1):
        # Determine the objects in frame i.
        objects = set(frames[i][frames[i]>0])
        # Determine the objects in frame i+1.
        objects_next = set(frames[i+1][frames[i+1]>0])
        for j in range(len(list(objects))):
            overlap = np.logical_and(frames[i] == list(objects)[j], 
                                     frames[i+1] > 0)
            overlap_objs = set((frames[i+1][overlap]).flatten())
            # If objects overlap, add edge between object j and first 
            # object from overlap set
            if bool(overlap_objs):
                overlap_graph.add_edges_from(
                    [(list(objects)[j], list(overlap_objs)[0])]
                )
                # Add edges between objects in overlap set
                for k in range(0, len(list(overlap_objs))-1):
                    overlap_graph.add_edges_from(
                        [(list(overlap_objs)[k], list(overlap_objs)[k+1])]
                    )  
    
    # Create new objects based on connected components 
    new_objs = list(connected_components(overlap_graph))
    frames_con = np.zeros(frames.shape, dtype=int)
    for i in range(len(new_objs)):
        frames_con[np.isin(frames, list(new_objs[i]))] = i + 1

    # Require that objects be present in all vertical level intervals    
    new_objs = list(set(frames_con[frames_con>0].flatten()))
    object_counter = 1
    for i in range(len(new_objs)):
        if np.all(np.any(frames_con == new_objs[i], axis=(1,2))):
            frames_con[frames_con == new_objs[i]] = object_counter
            object_counter += 1
        else:
            frames_con[frames_con == new_objs[i]] = 0

    return frames_con, frames
Example #20
def cluster(dupes, threshold=.5):
    """
    Takes in a list of duplicate pairs and clusters them into a
    list of records that all refer to the same entity, based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision; raising it will increase
                 recall
    """
    threshold = 1 - threshold

    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]


    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from(((x[0], x[1], y) for x, y in dupes))
    del dupes

    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 2:
            pair_gen = ((x[0:2], x[2]['weight'])
                        for x in dupe_graph.edges_iter(sub_graph, data=True))
                
            pairs = numpy.fromiter(pair_gen, dtype=score_dtype)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage, threshold, criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id, []).append(i_to_id[i])
            cluster_id += max(partition)

        else:
            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) > 1]

    
    return clusters
 def init(self):
     self.chokepoints()
     ccs = list(connected_components(self.sbmlprocessor.graph.to_undirected()))
     cp = sorted(ccs, key=lambda x: len(x))[-1]
     centrality_json = self.work_dir + "/centrality.json"
     if os.path.exists(centrality_json):
         self.centrality = json.load(open(centrality_json))
     else:
         self.centrality = {x: 0 for x in self.sbmlprocessor.graph.nodes()}
         cc = sorted(nx.connected_components(self.sbmlprocessor.graph.to_undirected()), key=lambda x: len(x))[-1]
         g2 = self.sbmlprocessor.graph.to_undirected().subgraph(cc)
         self.centrality = {x: (y if x in cp else 0) for x, y in
                            betweenness_centrality(g2).items()}
         json.dump(self.centrality, open(centrality_json, "w"))
Example #22
    def connectedComponents3(self):
        """Finds connected components of the graph (the molecules).

        @rtype:  list of instances of ChemicalGraph
        @return: List of connected components.
        """
        result = []
        comp_conexos = connected_components(self)  # "componentes conexos": connected components
        quedan = list(self.nodes())  # "quedan": nodes remaining to assign
        edges = self.edges()

        for comp in comp_conexos:
            result_graph = ChemicalGraph()
            result_graph.add_nodes(comp)

            for atom in result_graph.nodes():
                result_graph.add_node_attribute(atom,
                                                self.getAtomAttributes(atom))

            for index in edges:
                if (index[0] in comp) and (
                        index[1] in comp
                ) and not (index in result_graph.edges()):
                    result_graph.add_edge(index)

            # copy corresponding angles and dihedrals from the parent graph
            for atom in result_graph.nodes():
                for angle in self._angles:
                    try:
                        pos = angle.index(atom)
                        result_graph._angles.append(angle)
                        break
                    except ValueError:
                        pass
                for dihedral in self._dihedrals:
                    try:
                        pos = dihedral.index(atom)
                        result_graph._dihedrals.append(dihedral)
                        break
                    except ValueError:
                        pass

            result.append(result_graph)

            for v in comp:
                quedan.remove(v)

        return result
Example #23
    def getConnectedComponents(cls, groupNodes):
        """Get connected components from group of nodes.

        Parameters
        ----------
        groupNodes : list
            list of list of points

        Returns
        -------
        list
            list of connected components

        """
        G = cls.to_graph(groupNodes)
        return list(connected_components(G))
Example #24
    def test_message_number_graph(self, mock_rec_list_splice, mock_correctLastCharCR, mock_get_nick_sen_rec, mock_get_year_month_day,\
                         mock_get_nick_representative, mock_check_if_msg_line, mock_create_connected_nick_list, mock_to_graph):
        to_graph_ret = util.load_from_disk(
            current_directory + "/data/message_number_graph/to_graph")

        conn_list = list(connected_components(to_graph_ret))

        mock_to_graph.return_value = to_graph_ret
        mock_rec_list_splice.side_effect = util.load_from_disk(
            current_directory + "/data/message_number_graph/rec_list_splice")
        mock_create_connected_nick_list.return_value = util.load_from_disk(
            current_directory + "/data/message_number_graph/conn_comp_list")
        #mock_correct_last_char_list.side_effect = util.load_from_disk(current_directory + "/data/message_number_graph/correct_last_char_list")
        mock_check_if_msg_line.side_effect = util.load_from_disk(
            current_directory + "/data/message_number_graph/check_if_msg_line")
        mock_correctLastCharCR.side_effect = util.load_from_disk(
            current_directory + "/data/message_number_graph/correctLastCharCR")
        mock_get_nick_sen_rec.side_effect = util.load_from_disk(
            current_directory + "/data/message_number_graph/get_nick_sen_rec")
        #mock_extend_conversation_list.side_effect = util.load_from_disk(current_directory + "/data/message_number_graph/extend_conversation_list")
        mock_get_nick_representative.side_effect = util.load_from_disk(
            current_directory +
            "/data/message_number_graph/get_nick_representative")
        mock_get_year_month_day.side_effect = util.load_from_disk(
            current_directory +
            "/data/message_number_graph/get_year_month_day")

        capturedOutput = StringIO.StringIO()
        sys.stdout = capturedOutput

        ret = network.message_number_graph(self.log_data,
                                           self.nicks,
                                           self.nick_same_list,
                                           DAY_BY_DAY_ANALYSIS=False)

        sys.stdout = sys.__stdout__
        capturedOutput.close()

        mock_to_graph.assert_called_once_with(self.nick_same_list)
        mock_create_connected_nick_list.assert_called_once_with(conn_list)
        self.assertTrue(
            nx.is_isomorphic(
                ret,
                util.load_from_disk(
                    current_directory +
                    "/data/message_number_graph/aggregate_message_number_graph"
                )))
Example #25
def mif_preprocess_1(g: MultiGraph, f: set, active_v, k: int) -> set:
	if nxc.number_connected_components(g) >= 2:
		mif_set = set()
		for component in nxc.connected_components(g):
			f_i = component.intersection(f)
			gx = g.subgraph(component)
			component_mif_set = mif_preprocess_2(gx, f_i, active_v, None)
			if component_mif_set:
				mif_set = mif_set.union(component_mif_set)
				if k is not None:
					k -= (len(component_mif_set) - len(f_i))
					if k <= 0:
						return mif_set
		if k is None or len(mif_set) >= k:
			return mif_set
		return None
	return mif_preprocess_2(g, f, active_v, k)
Example #26
    def test_message_number_graph_day_analysis(self, mock_get_nick_sen_rec,
                                               mock_rec_list_splice,
                                               mock_correctLastCharCR,
                                               mock_check_if_msg_line,
                                               mock_create_connected_nick_list,
                                               mock_to_graph):
        to_graph_ret = util.load_from_disk(
            self.current_directory + "/data/message_number_graph/to_graph")

        conn_list = list(connected_components(to_graph_ret))

        mock_to_graph.return_value = to_graph_ret
        mock_rec_list_splice.side_effect = util.load_from_disk(
            self.current_directory +
            "/data/message_number_graph/rec_list_splice")
        mock_create_connected_nick_list.return_value = util.load_from_disk(
            self.current_directory +
            "/data/message_number_graph/conn_comp_list")
        mock_check_if_msg_line.side_effect = util.load_from_disk(
            self.current_directory +
            "/data/message_number_graph/check_if_msg_line")
        mock_correctLastCharCR.side_effect = util.load_from_disk(
            self.current_directory +
            "/data/message_number_graph/correctLastCharCR")
        mock_get_nick_sen_rec.side_effect = util.load_from_disk(
            self.current_directory +
            "/data/message_number_graph/get_nick_sen_rec")

        capturedOutput = StringIO.StringIO()
        sys.stdout = capturedOutput

        ret = network.message_number_graph(self.log_data,
                                           self.nicks,
                                           self.nick_same_list,
                                           DAY_BY_DAY_ANALYSIS=True)
        expected_graph_list = util.load_from_disk(
            self.current_directory +
            "/data/message_number_graph/message_number_day_list")

        sys.stdout = sys.__stdout__
        capturedOutput.close()

        mock_to_graph.assert_called_once_with(self.nick_same_list)
        mock_create_connected_nick_list.assert_called_once_with(conn_list)
        self.assertTrue(nx.is_isomorphic(ret[0][0], expected_graph_list[0][0]))
        self.assertTrue(nx.is_isomorphic(ret[1][0], expected_graph_list[1][0]))
Example #27
def hierarchical_cluster(clusters, threshold):
    threshold = 1 - threshold
    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]

    #    lclassifier.predict_proba(distances)[0][1] > threshold

    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from((x[0], x[1], y) for (x, y) in clusters)

    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_scores = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 2:
            pair_gen = ((sorted(x[0:2]), x[2]['weight'])
                        for x in dupe_graph.edges_iter(sub_graph, data=True))

            pairs = np.fromiter(pair_gen, dtype=score_dtype)
            pairlist = list(pairs)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage,
                                          threshold,
                                          criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id,
                                      []).append(i_to_id[i])

            cluster_id += max(partition)
        elif len(sub_graph) == 2:
            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) >= 2]
    return (clusters, cluster_scores)
def main():
    """
        Reads arguments, finds matches from the inputs, shows matches as groups
        and plots a graph
    """
    parser = argparse.ArgumentParser(
        description='This program prints out groups of words that have a certain' \
         ' Levenshtein edit distance')

    parser.add_argument(
        '--ratio',
        dest='min_match_ratio',
        choices=[str(i) for i in range(0, 101)],
        help='Number that determines the Levenshtein edit distance, should be between 0 and 100'
    )
    parser.add_argument(
        '--files',
        dest='files',
        default='',
        nargs='+',
        help='The files that contains new line separated words'
    )
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s 1.0alpha'
    )

    opts = parser.parse_args()
    min_match_ratio = int(opts.min_match_ratio or 80)

    words = read_files_into_list(opts.files)

    # the following converts the list to a graph that will merge lists if they have shared items
    matches = find_matches(words, min_match_ratio)
    graph = to_graph(matches)
    draw_cluster(graph, networkx.spring_layout(graph))
    for group in list(connected_components(graph)):
        print(group)

    plot.show()
    input('Press enter to continue...')
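
`read_files_into_list`, `find_matches`, and `draw_cluster` are assumed helpers here. A sketch of `find_matches` using `difflib.SequenceMatcher` as a stand-in for a true Levenshtein ratio, scaled to the script's 0-100 range:

import difflib

def find_matches(words, min_match_ratio):
    """Sketch: pair up words whose similarity ratio reaches the cutoff."""
    matches = []
    for i, word in enumerate(words):
        for other in words[i + 1:]:
            ratio = difflib.SequenceMatcher(None, word, other).ratio() * 100
            if ratio >= min_match_ratio:
                matches.append([word, other])
    return matches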
Example #29
def group_alignments(alignments, contig_names):
    contig_groupings = []
    for alignment in alignments:
        if alignment.qseqid in contig_names:
            contig_names.remove(alignment.qseqid)
        if alignment.sseqid in contig_names:
            contig_names.remove(alignment.sseqid)
        contig_groupings.append([alignment.qseqid, alignment.sseqid])
    G = to_graph(contig_groupings)
    groupings = {}
    for i, group in enumerate(connected_components(G)):
        groupings[i] = group
    if len(groupings.keys()) != 0:
        max_group = max(groupings.keys())
    else:
        max_group = 0
    for i, contig in enumerate(contig_names):
        groupings[i + max_group + 1] = [contig]
    return groupings
Example #30
def merge_boxes_in_results(results_dict, min_conf_threshold, iou_threshold):
    final_results = Results()

    # Clean dict to remove min_conf_threshold
    for _, regions in results_dict.items():
        to_remove = []
        for r in regions:
            if r.conf < min_conf_threshold:
                to_remove.append(r)
        for r in to_remove:
            regions.remove(r)

    for fid, regions in results_dict.items():
        overlap_pairwise_list = pairwise_overlap_indexing_list(
            regions, iou_threshold)
        overlap_graph = to_graph(overlap_pairwise_list)
        grouped_bbox_idx = sorted(connected_components(overlap_graph),
                                  key=len, reverse=True)
        merged_regions = simple_merge(regions, grouped_bbox_idx)
        for r in merged_regions:
            final_results.append(r)
    return final_results
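
`pairwise_overlap_indexing_list` and `simple_merge` are not shown; a sketch of the former based on the usual IoU test (the region attributes `x`, `y`, `w`, `h` are assumptions):

def pairwise_overlap_indexing_list(regions, iou_threshold):
    """Sketch: sublists of region indices for to_graph(); every region shows
    up at least as a singleton, and overlapping pairs become two-element lists."""
    index_list = [[i] for i in range(len(regions))]  # keep isolated regions
    for i, a in enumerate(regions):
        for j in range(i + 1, len(regions)):
            b = regions[j]
            ix = max(0.0, min(a.x + a.w, b.x + b.w) - max(a.x, b.x))
            iy = max(0.0, min(a.y + a.h, b.y + b.h) - max(a.y, b.y))
            intersection = ix * iy
            union = a.w * a.h + b.w * b.h - intersection
            if union > 0 and intersection / union > iou_threshold:
                index_list.append([i, j])
    return index_list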
Example #31
def getIndependentGDTGroups_py(data_slice,
                               threshold,
                               # n_jobs=1,
                               rand_gen=None):

    gdt_adjacency_matrix = pairwise_gdt(data_slice)

    n_features = len(data_slice.cols)

    #
    # thresholding
    gdt_adjacency_matrix[gdt_adjacency_matrix < threshold] = 0
    #print("thresholding", gdt_adjacency_matrix)

    #
    # getting connected components
    result = numpy.zeros(n_features)
    for i, c in enumerate(connected_components(from_numpy_matrix(gdt_adjacency_matrix))):
        result[list(c)] = i + 1

    return result
Example #32
    def test_message_time_graph(self, mock_get_nick_sen_rec, mock_correct_last_char_list, \
                                mock_rec_list_splice, mock_check_if_msg_line, mock_create_connected_nick_list,
                                mock_to_graph):
        to_graph_ret = util.load_from_disk(self.test_data_dir +
                                           "message_time_graph/to_graph")

        conn_list = list(connected_components(to_graph_ret))

        mock_to_graph.return_value = to_graph_ret
        mock_rec_list_splice.side_effect = util.load_from_disk(
            self.test_data_dir + "message_time_graph/rec_list_splice")
        mock_create_connected_nick_list.return_value = util.load_from_disk(
            self.test_data_dir + "message_time_graph/conn_comp_list")
        mock_check_if_msg_line.side_effect = util.load_from_disk(
            self.test_data_dir + "message_time_graph/check_if_msg_line")
        mock_get_nick_sen_rec.side_effect = util.load_from_disk(
            self.test_data_dir + "message_time_graph/get_nick_sen_rec")
        mock_correct_last_char_list.side_effect = util.load_from_disk(
            self.test_data_dir + "message_time_graph/correct_last_char_list")

        capturedOutput = StringIO.StringIO()
        sys.stdout = capturedOutput

        graph = network.message_time_graph(self.log_data,
                                           self.nicks,
                                           self.nick_same_list,
                                           DAY_BY_DAY_ANALYSIS=False)

        sys.stdout = sys.__stdout__
        capturedOutput.close()

        mock_to_graph.assert_called_once_with(self.nick_same_list)
        mock_create_connected_nick_list.assert_called_once_with(conn_list)
        self.assertTrue(
            nx.is_isomorphic(
                graph,
                util.load_from_disk(self.test_data_dir +
                                    "message_time_graph/msg_time_aggr_graph")))
Example #33
def getIndependentGroupsStabilityTest(data, alpha=0.001):

    df = DataFrame(data,
                   columns=["V" + str(i) for i in range(1, data.shape[1] + 1)])

    #pvals = bonferroniCorrection(computeEstabilityTest(df, 0))

    #compute stability test
    with Pool() as pool:
        pvals = pool.starmap(computePvals, zip(repeat(df), range(df.shape[1])))

    #print(pvals)

    pvals = numpy.asarray(pvals)

    #print(pvals[0,:])

    # symmetrize the p-value matrix so the implied graph is undirected
    for i, j in zip(*numpy.tril_indices(pvals.shape[1])):
        pvals[i, j] = pvals[j, i] = min(pvals[i, j], pvals[j, i])

    pvals[numpy.diag_indices_from(pvals)] = 1

    #print(pvals)

    pvals[pvals > alpha] = 0

    result = numpy.zeros(df.shape[1])
    for i, c in enumerate(connected_components(from_numpy_matrix(pvals))):
        result[list(c)] = i + 1

    return result
Example #34
    def test_message_number_graph(self):
        to_graph_ret = util.load_from_disk(self.test_data_dir +
                                           "message_number_graph/to_graph")

        conn_list = list(connected_components(to_graph_ret))

        capturedOutput = StringIO.StringIO()
        sys.stdout = capturedOutput

        ret = network.message_number_graph(self.log_data,
                                           self.nicks,
                                           self.nick_same_list,
                                           DAY_BY_DAY_ANALYSIS=False)

        sys.stdout = sys.__stdout__
        capturedOutput.close()

        self.assertTrue(
            nx.is_isomorphic(
                ret,
                util.load_from_disk(
                    self.test_data_dir +
                    "message_number_graph/aggregate_message_number_graph")))
Example #35
def group_alignments(alignments, fosmids_to_ignore, fosmid_names):
    fosmid_groupings = []
    for alignment in alignments:
        if alignment.qseqid in fosmids_to_ignore:
            continue
        if alignment.sseqid in fosmids_to_ignore:
            continue
        if alignment.qseqid in fosmid_names:
            fosmid_names.remove(alignment.qseqid)
        if alignment.sseqid in fosmid_names:
            fosmid_names.remove(alignment.sseqid)
        fosmid_groupings.append([alignment.qseqid, alignment.sseqid])
    G = to_graph(fosmid_groupings)
    groupings = {}
    for i, group in enumerate(connected_components(G)):
        groupings[i] = group
    if len(groupings.keys()) != 0:
        max_group = max(groupings.keys())
    else:
        max_group = 0
    for i, fosmid in enumerate(fosmid_names):
        groupings[i + max_group + 1] = [fosmid]
    return groupings
Example #36
def mif_preprocess_2(g: MultiGraph, f: set, active_v, k: int) -> set:
	mif_set = set()
	while not is_independent_set(g, f):
		mif_set = mif_set.union(f)
		for component in nxc.connected_components(g.subgraph(f)):
			if len(component) > 1:
				if active_v in component:
					active_v = component.pop()
					compressed_node = active_v
				else:
					compressed_node = component.pop()
				g = compress(g, component, compressed_node, True)
				f = f.intersection(g.nodes())
				# Maybe faster with
				# f = f.difference(component)
				# f.add(compressed_node)
				mif_set = mif_set.union(component)
				break
	mif_set2 = mif_main(g, f, active_v, k)
	if mif_set2:
		mif_set = mif_set2.union(mif_set)
	if k is None or len(mif_set) >= k:
		return mif_set
	return None
Example #37
def message_time_graph(log_dict, nicks, nick_same_list, DAY_BY_DAY_ANALYSIS=False):
    """ creates a directed graph where each edge denotes a message sent from a user to another user
    with the stamp denoting the time at which the message was sent

    Args:
        log_dict (dictionary): Dictionary of logs data created using reader.py
        nicks(List) : List of nickname created using nickTracker.py
        nick_same_list(List) :List of same_nick names created using nickTracker.py

    Returns:
       msg_time_graph_list(List): List of message time graphs for different days
       msg_time_aggr_graph: aggregate message time graph where edges are date + time when sender sends a message to receiver
    """  
    msg_time_graph_list = []
    msg_time_aggr_graph = nx.MultiDiGraph()
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))

    def compare_spliced_nick(nick_to_compare, spliced_nick, nick_name, line):
        if(nick_to_compare == nick_name):
            if(spliced_nick != nick_name):
                nick_receiver = nick_receiver_from_conn_comp(nick_name, conn_comp_list)        
                util.build_graphs(nick_sender, nick_receiver, line[1:6], year, month, day, graph_conversation, msg_time_aggr_graph)             
     
    util.create_connected_nick_list(conn_comp_list)

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            year, month, day = util.get_year_month_day(day_content)
            graph_conversation = nx.MultiDiGraph()  #graph with multiple directed edges between clients used
            for line in day_log:
                flag_comma = 0
                if(util.check_if_msg_line (line)):
                    m = re.search(r"\<(.*?)\>", line)         
                    spliced_nick = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = ""                          
                    nick_sender = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, spliced_nick, conn_comp_list, nick_sender)

                    for nick_name in nicks:
                        rec_list = [e.strip() for e in line.split(':')]  #receiver list split on ':'
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:  #index 0 will contain time 14:02
                            break                        
                        rec_list = util.correct_last_char_list(rec_list)        
                        for nick_to_search in rec_list:
                            if(nick_to_search == nick_name):
                                if(spliced_nick != nick_name):                                    
                                    nick_receiver = ""                                         
                                    nick_receiver = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, nick_name, conn_comp_list, nick_receiver)                                            
                                    util.build_graphs(nick_sender, nick_receiver, line[1:6], year, month, day, graph_conversation, msg_time_aggr_graph)

                        if "," in rec_list[1]:  #receiver list may of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)        
                            for nick_to_search in rec_list_2:                              
                                compare_spliced_nick(nick_to_search, spliced_nick, nick_name, line)   

                        if(flag_comma == 0):  #receiver list can be <Dhruv> Rohan, Hi!
                            rec = line[line.find(">") + 1:line.find(", ")]
                            rec = util.correctLastCharCR(rec[1:])                           
                            compare_spliced_nick(rec, spliced_nick, nick_name, line)    

            msg_time_graph_list.append(graph_conversation)

    if DAY_BY_DAY_ANALYSIS:
        return msg_time_graph_list
    else:
        return msg_time_aggr_graph
Example #38
tempsum = 0
for i, v in enumerate(statics.values()):
	tempsum += len(v)

print "Avg neighbours: {:.2f}, total nodes: {:3d}".format(float(tempsum) / len(statics), len(statics))
print sorted(statics.keys())

for node, neighbour_list in statics.iteritems():
	for nid, num in neighbour_list.iteritems():
		G.add_edge(node, nid)
			
pos = nx.graphviz_layout(G)
nodelist = G.nodes()
print connected_components(G)
labels={}

pl.figure()
for node in nodelist:
	labels[node] = node
	
nx.draw_networkx_nodes(G, pos, node_size = 120, node_color='r')
#nx.draw_networkx_nodes(G, pos, node_size = 200, nodelist=[20,75], node_color='b')
nx.draw_networkx_edges(G, pos, alpha=0.5)

nx.draw_networkx_labels(G,pos, labels=labels, font_size=12)
pl.savefig("connectivity.pdf")
pl.show()
################################################ABOVE IS CONNECTIVITY################################
Example #39
	def gcc_ratio(self):
		cc_sizes = list()
		for cc in connected_components(self._network):
			cc_sizes.append(len(cc))
		return float(max(cc_sizes)) / float(self._network.number_of_nodes())
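
The same ratio can be read directly off the largest component; a minimal equivalent for comparison:

import networkx as nx

def gcc_ratio(network):
    """Fraction of nodes in the giant (largest) connected component."""
    giant = max(nx.connected_components(network), key=len)
    return len(giant) / network.number_of_nodes()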
Example #40
def response_time(log_dict, nicks, nick_same_list):

	""" finds the response time of a message 
	i.e. the best guess for the time at which one can expect a reply for his/her message.

	Args:
		log_dict (str): Dictionary of logs data created using reader.py
		nicks(List) : List of nickname created using nickTracker.py
		nick_same_list :List of same_nick names created using nickTracker.py
		output_directory (str): Location of output directory
		
	Returns:
	   rows_RT(zip List): Response Time (This refers to the response
		time of a message i.e. the best guess for the time at
		which one can expect a reply for his/her message)

	"""
	G = util.to_graph(nick_same_list)
	conn_comp_list = list(connected_components(G))

	util.create_connected_nick_list(conn_comp_list)
	
	graph_cumulative = []
	graph_x_axis = []
	graph_y_axis = []

	def build_mean_list(conversations, index, mean_list):
		for j in range(2, len(conversations[index])):
			mean_list.append(conversations[index][j])
		return mean_list

	def resp_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list):
		if(rec == nick):
			send_time.append(line[1:6])
			if(nick_to_search != nick):
				nick_receiver = util.get_nick_sen_rec(len(nicks), nick, conn_comp_list, nick_receiver)								
				for i in range(config.MAX_RESPONSE_CONVERSATIONS):
					if (nick_sender in conversations[i] and nick_receiver in conversations[i]): 
						conversations[i].append(line[1:6])
						break
					if(len(conversations[i]) == 0):
						conversations[i].append(nick_sender)
						conversations[i].append(nick_receiver)
						conversations[i].append(line[1:6])
						break		
		return conversations, nick_receiver, send_time				

	for day_content_all_channels in log_dict.values():
		for day_content in day_content_all_channels:
			day_log = day_content["log_data"]

			send_time = []  #list of all the times a user sends a message to another user
			meanstd_list = []
			totalmeanstd_list = []
			x_axis = []
			y_axis = []
			real_y_axis = []             
			conversations = [[] for i in range(config.MAX_RESPONSE_CONVERSATIONS)]

			#code for making relation map between clients       
			for line in day_log:
				flag_comma = 0
				if(util.check_if_msg_line (line)):
					nick_sender = ""
					nick_receiver = ""
					m = re.search(r"\<(.*?)\>", line)
					nick_to_search = util.correctLastCharCR(m.group(0)[1:-1])
					nick_sender = util.get_nick_sen_rec(len(nicks), nick_to_search, conn_comp_list, nick_sender)					         
					for nick in nicks:
						rec_list = [e.strip() for e in line.split(':')]
						util.rec_list_splice(rec_list)

						if not rec_list[1]:
							break						
						rec_list = util.correct_last_char_list(rec_list)		
						
						for name in rec_list:
							conversations, nick_receiver, send_time = resp_helper(name, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)							
							
						if "," in rec_list[1]: 
							flag_comma = 1
							rec_list_2 = [e.strip() for e in rec_list[1].split(',')]							
							rec_list_2 = util.correct_last_char_list(rec_list_2)		
								
							for name in rec_list_2:
								conversations, nick_receiver, send_time = resp_helper(name, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)								

						if(flag_comma == 0):
							rec = util.splice_find(line, ">", ", ",1)							
							conversations, nick_receiver, send_time = resp_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)						
			
			for i in range(config.MAX_RESPONSE_CONVERSATIONS):
				if(len(conversations[i]) != 0):  
					for j in range(2, len(conversations[i]) - 1):
						conversations[i][j]=(int(conversations[i][j+1][0:2])*config.MINS_PER_HOUR+int(conversations[i][j+1][3:5])) - (int(conversations[i][j][0:2])*config.MINS_PER_HOUR+int(conversations[i][j][3:5]))
	
			for i in range(config.MAX_RESPONSE_CONVERSATIONS):
				if(len(conversations[i]) != 0): 
					if(len(conversations[i]) == 3):
						conversations[i][2] = int(conversations[i][2][0:2])*config.MINS_PER_HOUR+int(conversations[i][2][3:5])     
					else: 
						del conversations[i][-1]

		#Explanation provided in parser-CL+CRT.py
			for i in range(config.MAX_RESPONSE_CONVERSATIONS):
				if(len(conversations[i]) != 0):					
					totalmeanstd_list = build_mean_list(conversations, i, totalmeanstd_list)					

			if(len(totalmeanstd_list) != 0):
				for i in range(max(totalmeanstd_list) + 1):
					x_axis.append(i)

				for i in x_axis:
					y_axis.append(float(totalmeanstd_list.count(i)) / float(len(totalmeanstd_list)))
				
				#finding the probability of each RT to occur=No. of occurence/total occurences.
				real_y_axis.append(y_axis[0])
				for i in range(len(y_axis)):
					real_y_axis.append(float(real_y_axis[i-1]) + float(y_axis[i]))
			
			#to find cumulative just go on adding the current value to previously cumulated value till sum becomes 1 for last entry.
			for i in range(len(totalmeanstd_list)):
				graph_cumulative.append(totalmeanstd_list[i])

			if len(totalmeanstd_list) > 0:
				totalmeanstd_list.append(numpy.mean(totalmeanstd_list))
				totalmeanstd_list.append(numpy.mean(totalmeanstd_list)+2*numpy.std(totalmeanstd_list))
		
			for i in range(config.MAX_RESPONSE_CONVERSATIONS):
				if(len(conversations[i]) != 0):					
					meanstd_list = build_mean_list(conversations, i, meanstd_list)					
					conversations[i].append(numpy.mean(meanstd_list))
					conversations[i].append(numpy.mean(meanstd_list)+(2*numpy.std(meanstd_list)))
					meanstd_list[:] = []

	graph_cumulative.sort()

	for i in range(graph_cumulative[len(graph_cumulative)-1] + 1):
		graph_y_axis.append(graph_cumulative.count(i))     # problem when ti=0 count is unexpectedly large
		graph_x_axis.append(i)		

	#Finally storing the RT values along with their frequencies in a csv file. 
	rows_rt = zip(graph_x_axis, graph_y_axis)
	return rows_rt	
Example #41
def conv_len_conv_refr_time(log_dict, nicks, nick_same_list):

	""" Calculates the conversation length (CL) that is the length of time for which two users communicate 
	i.e. if a message is not replied to within Response Time(RT), 
	then it is considered as a part of another conversation.
	This function also calculates the conversation refresh time(CRT)
	For a pair of users, this is the time when one conversation ends and another one starts.
	Args:   
		log_dict (str): Dictionary of logs data created using reader.py
		nicks(List) : list of nickname created using nickTracker.py
		nick_same_list :List of same_nick names created using nickTracker.py        
	Returns:
		row_cl(zip List): Conversation Length
		row_crt(zip List) :Conversation Refresh time
	   
	""" 
	conv = []
	conv_diff = []  

	G = util.to_graph(nick_same_list)
	conn_comp_list = list(connected_components(G))

	util.create_connected_nick_list(conn_comp_list)

	# We use the connected-components algorithm to group all nick clusters that have
	# at least one nick in common. E.g. cluster 1 = nick1,nick2,nick3,nick4 (some nicks
	# of a user) and cluster 2 = nick5,nick6,nick2,nick7 merge into
	# nick1,nick2,nick3,nick4,nick5,nick6,nick7, which we can safely assume belong to the same user.

	conversations = [[] for i in range(config.MAX_CONVERSATIONS)]  #This might need to be incremented if we have more users. Applies to the other code paths too.
											 ## It would be advisable to use a data structure that does not have a fixed upper bound like these arrays do.
	graphx1 =[]
	graphy1 =[]
	graphx2 =[]
	graphy2 =[]

	dateadd = -1 #Variable used for response time calculation. Varies from 0-365.

	def build_conversation(rec_list, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list):
		for names in rec_list:
			conversations, nick_receiver, send_time = conv_helper(names, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list)			
		return  conversations, nick_receiver, send_time

	def conv_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list):
		if(rec == nick):
			send_time.append(line[1:6])
			if(nick_to_search != nick):
				nick_receiver = util.get_nick_sen_rec(len(nicks), nick, conn_comp_list, nick_receiver)				
				for i in range(config.MAX_CONVERSATIONS):
					if (nick_sender in conversations[i] and nick_receiver in conversations[i]):
						conversations = conv_append(conversations, i, dateadd, line)
						break
					if(len(conversations[i]) == 0):
						conversations[i].append(nick_sender)
						conversations[i].append(nick_receiver)						
						conversations = conv_append(conversations, i, dateadd, line)
						break
		return  conversations, nick_receiver, send_time

	def conv_mat_diff(i,j,conversations):
		"""
		i(int): matrix index for row 
		j(int): matrix index for column
		"""
		return (conversations[i][j]-conversations[i][j-1])	

	def conv_append(conversations, index, dateadd, line):
		conversations[index].append(config.HOURS_PER_DAY*config.MINS_PER_HOUR*dateadd + int(line[1:6][0:2])*config.MINS_PER_HOUR + int(line[1:6][3:5]))
		return conversations

	for day_content_all_channels in log_dict.values():
		for day_content in day_content_all_channels:
			day_log = day_content["log_data"]

			dateadd = dateadd + 1
			send_time = [] #list of all the times a user sends a message to another user		
			#code for making relation map between clients       
			for line in day_log:
				flag_comma = 0
				if(util.check_if_msg_line (line)):
					nick_sender = ""
					nick_receiver = ""
					m = re.search(r"\<(.*?)\>", line)
					nick_to_search = util.correctLastCharCR(m.group(0)[1:-1])
					nick_sender = util.get_nick_sen_rec(len(nicks), nick_to_search, conn_comp_list, nick_sender)				
						
					for nick in nicks:
						rec_list = [e.strip() for e in line.split(':')]
						util.rec_list_splice(rec_list)						
						if not rec_list[1]:
							break						
						rec_list = util.correct_last_char_list(rec_list)							
						conversations, nick_receiver, send_time = build_conversation(rec_list, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list)
													
						if "," in rec_list[1]: 
							flag_comma = 1
							rec_list_2 = [e.strip() for e in rec_list[1].split(',')]							
							rec_list_2 = util.correct_last_char_list(rec_list_2)
							conversations, nick_receiver, send_time = build_conversation(rec_list_2, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list)		
							
						if(flag_comma == 0):
							rec = util.splice_find(line, ">", ", ", 1)							            
							conversations, nick_receiver, send_time	= conv_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list)			
		
	#The loops above consider all the ways messages are addressed (nick1:nick2, nick1,nick2 or nick1,nick2:) and store the timestamps in conversations. conversations[i] contains all the response times between userA and userB throughout the whole period.

	for i in range(len(conversations)):       #Remove the first two elements of every conversations[i]: they are the UIDs of the sender and receiver, not response times.
		if(len(conversations[i]) != 0):              #Response times start at index 2, so after this deletion conversations holds only response times.
			del conversations[i][0:2]

	for i in range(len(conversations)):
		if(len(conversations[i]) != 0):
			first = conversations[i][0]
			for j in range(1, len(conversations[i])):
					if(conv_mat_diff(i, j, conversations) > 9):

						conv.append(conversations[i][j-1] - first)    #Record the conversation length in conv and the CRT in conv_diff. The threshold 9 is the average response
																	#time found earlier (see parser-RT.py); it differs per channel and has to be adjusted in the code.
						conv_diff.append(conv_mat_diff(i, j, conversations))
						first = conversations[i][j]
					if(j == (len(conversations[i]) - 1)):
						conv.append(conversations[i][j] - first)                    
						break						

	def build_conv_csv(conv_list, graph_x, graph_y):

		for i in range(max(conv_list) + 1):   #+1 so the maximum value itself is counted
			graph_x.append(i)
			graph_y.append(conv_list.count(i))

		return graph_x, graph_y

	graphx1, graphy1 = build_conv_csv(conv, graphx1, graphy1)
	graphx2, graphy2 = build_conv_csv(conv_diff, graphx2, graphy2)	

	#To plot the CDF we store the CL and CRT values and their number of occurrences as shown above.

	row_cl = zip(graphx1, graphy1)
	row_crt = zip(graphx2, graphy2)
	
	return row_cl, row_crt
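# --- Standalone sketch: how overlapping nick clusters merge into one user. ---
# A minimal, self-contained illustration of the connected-components merging
# described in the function above; the nick names here are made up.
import networkx as nx

cluster_1 = ['nick1', 'nick2', 'nick3', 'nick4']
cluster_2 = ['nick5', 'nick6', 'nick2', 'nick7']   # shares 'nick2' with cluster_1

G = nx.Graph()
for cluster in (cluster_1, cluster_2):
	G.add_nodes_from(cluster)
	G.add_edges_from(zip(cluster, cluster[1:]))    # chain each cluster's nicks together

print(sorted(list(nx.connected_components(G))[0]))
# -> ['nick1', 'nick2', 'nick3', 'nick4', 'nick5', 'nick6', 'nick7']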
Beispiel #42
0
  def findBonds(self, ratio=setting.bond_ratio, **kwargs):
    del self.segments
    del self.bond_types
    self.segments = []
    self.bond_types = {}
    if 'no_report' not in kwargs or not kwargs['no_report']:
      qtk.report("Molecule", 
                 "finding bonds with cutoff ratio", 
                 ratio)
    def to_graph(l):
      G = networkx.Graph()
      for part in l:
        # each sublist is a bunch of nodes
        G.add_nodes_from(part)
        # it also implies a number of edges:
        G.add_edges_from(to_edges(part))
      return G
    
    def to_edges(l):
      """ 
      treat `l` as a Graph and returns it's edges 
      to_edges(['a','b','c','d']) -> [(a,b), (b,c),(c,d)]
      """
      it = iter(l)
      last = next(it)
    
      for current in it:
        yield last, current
        last = current 
    itr = 0
    bond_list = []
    bonded = [False for i in range(self.N)]
    for i in xrange(self.N):
      for j in xrange(i+1, self.N):
        d_ij = np.linalg.norm(self.R[i,:] - self.R[j,:])
        atom_i = getattr(pt, self.type_list[i])
        atom_j = getattr(pt, self.type_list[j])
        Ri = atom_i.covalent_radius + \
             atom_i.covalent_radius_uncertainty
        Rj = atom_j.covalent_radius + \
             atom_j.covalent_radius_uncertainty
        Dij = (Ri+Rj) * float(ratio)
        if d_ij < Dij:
          bonded[i] = True
          bonded[j] = True
          if self.Z[i] < self.Z[j]:
            atom_begin = self.Z[i]
            atom_end = self.Z[j]
            index_begin = i
            index_end = j
          else:
            atom_begin = self.Z[j]
            atom_end = self.Z[i]
            index_begin = j
            index_end = i
          self.bonds[itr] = {'atom_begin'  : atom_begin,
                             'index_begin' : index_begin,
                             'atom_end'    : atom_end,
                             'index_end'   : index_end,
                             'length'      : d_ij}
          bond_list.append([i, j])
          type_begin = qtk.Z2n(atom_begin)
          type_end   = qtk.Z2n(atom_end)
          bond_table = qtk.data.elements.bond_table
          bond_keys = [
            type_begin + _ + type_end for _ in ['-', '=', '#']
          ]
          # keep only keys actually present in the bond table, so the argmin
          # index below refers to the same (filtered) list we then select from
          valid_keys = [k for k in bond_keys if k in bond_table]
          try:
            bond_type_ind = np.argmin(
              abs(
                np.array([
                          bond_table[k][0] for k in valid_keys
                         ]) - d_ij
              )
            )
          except Exception as _e:
            self.write_xyz()
            qtk.exit(
              "error while processing bond " +\
              str(bond_keys) + " with error message %s" % str(_e))
          bond_type = valid_keys[bond_type_ind]
          self.bonds[itr]['name'] = bond_type
          bond_energy = \
            bond_table[valid_keys[bond_type_ind]][1] * \
            qtk.convE(1, 'kj-kcal')[0]
          self.bonds[itr]['energy'] = bond_energy
          if np.isnan(bond_energy):
            qtk.warning("Non-tabliated covalent bond %s" % bond_type)
          if bond_type in self.bond_types:
            self.bond_types[bond_type] += 1
          else:
            self.bond_types[bond_type] = 1
          itr += 1

    segments = list(connected_components(to_graph(bond_list)))
    for s in range(len(segments)):
      segment = list(segments[s])
      new_mol = self.getSegment(segment, **kwargs)
      ns = len(self.segments)
      new_mol.name = new_mol.name + '_%d' % ns
      self.segments.append(new_mol)
    for s in [i for i in range(self.N) if not bonded[i]]:
      segment = [s]
      new_mol = self.getSegment(segment, **kwargs)
      ns = len(self.segments)
      new_mol.name = new_mol.name + '_%d' % ns
      self.segments.append(new_mol)
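# --- Standalone sketch of the covalent-radius cutoff used in findBonds above. ---
# The radii below are illustrative stand-ins for the periodic-table data the
# original pulls from qtk; only the d_ij < (Ri + Rj) * ratio test is shown.
import numpy as np

covalent_radius = {'O': 0.66, 'H': 0.31}          # Angstrom, approximate values
R = np.array([[0.00, 0.00, 0.00],                 # O at the origin
              [0.96, 0.00, 0.00]])                # H at a typical O-H distance
ratio = 1.1                                       # assumed bond_ratio setting

d_ij = np.linalg.norm(R[0, :] - R[1, :])
cutoff = (covalent_radius['O'] + covalent_radius['H']) * ratio
print(d_ij < cutoff)                              # True: counted as a bond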
#print(x)  
for ni in nicks:
	for ind in range(500):
		if ni in x[ind]:
			break
		if not x[ind]:
			x[ind].append(ni)
			break

#print("*********************x**********************************")
#print(x)


G = to_graph(x)
L = list(connected_components(G))

	

for i in range(1,len(L)+1):
	L[i-1] = list(L[i-1])

#print(L)
	


for iterator in range(1,13):
	for fileiterator in range(1,32):
		if(fileiterator<10): 
			file_path="/home/dhruvie/LOP/2013/"+str(iterator)+"/0" 
			file_path=file_path+str(fileiterator)+"/#kubuntu-devel.txt"
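# --- Standalone sketch of the zero-padded log-path scheme used throughout. ---
# The directory layout assumed by these examples is <base>/<MM>/<DD>/<channel>.txt;
# the base path below is illustrative.
log_directory = "/home/user/LOP/2013/"
month, day, channel = 2, 7, "#kubuntu-devel"
file_path = log_directory + "%02d/%02d/%s.txt" % (month, day, channel)
print(file_path)   # -> /home/user/LOP/2013/02/07/#kubuntu-devel.txt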
def implementWithIgraphs(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):

	nick_same_list=[[] for i in range(5000)]
	nicks = [] #list of all the nicknames

	conversations=[[] for i in range(5000)]
	for i in xrange(0,5000):
		conversations[i].append(0)

	for folderiterator in range(startingMonth, endingMonth+1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the file for channel_name
	
			print(filePath)
			send_time = [] #list of all the times a user sends a message to another user
			nicks_for_the_day = []
			channel= "#kubuntu-devel" #channel name
			
			#code for getting all the nicknames in a list
			for i in content:
				if(i[0] != '=' and "] <" in i and "> " in i):
					m = re.search(r"\<(.*?)\>", i)
					if m.group(0) not in nicks_for_the_day:                       
						nicks_for_the_day.append(m.group(0))   #used regex to get the string between <> and appended it to the nicks list

			for i in xrange(0,len(nicks_for_the_day)):
				if nicks_for_the_day[i][1:-1] not in nicks:
					nicks.append(nicks_for_the_day[i][1:-1])     #removed <> from the nicknames
				
			for i in xrange(0,len(nicks)):
				if(len(nicks[i])!=0):
						nicks[i]=correctLastCharCR(nicks[i])

			for j in content:
				if(j[0]=='=' and "changed the topic of" not in j):
					line1=j[j.find("=")+1:j.find(" is")]
					line2=j[j.find("wn as")+1:j.find("\n")]
					line1=line1[3:]
					line2=line2[5:]
					if(len(line1)!=0):
						line1=correctLastCharCR(line1)
						
					if(len(line2)!=0):
						line2=correctLastCharCR(line2)
					if line1 not in nicks:
						nicks.append(line1)
					if line2 not in nicks:
						nicks.append(line2)

			#code for forming a list of lists to avoid nickname duplication
			for line in content:
				if(line[0]=='=' and "changed the topic of" not in line):
					line1=line[line.find("=")+1:line.find(" is")]
					line2=line[line.find("wn as")+1:line.find("\n")]
					line1=line1[3:]
					line2=line2[5:]
					if(len(line1)!=0):
						line1=correctLastCharCR(line1)
						
					if(len(line2)!=0):
						line2=correctLastCharCR(line2)
					for i in range(5000):
						if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
							if line1 in nick_same_list[i] and line2 not in nick_same_list[i]:
								nick_same_list[i].append(line2)
								break
							if line2 in nick_same_list[i] and line1 not in nick_same_list[i]: 
								nick_same_list[i].append(line1)
								break
							if line2 in nick_same_list[i] and line1 in nick_same_list[i]:
								break  
						if not nick_same_list[i]:
							nick_same_list[i].append(line1)
							nick_same_list[i].append(line2)
							break

	#print(nick_same_list)  
	for ni in nicks:
		for ind in range(5000):
			if ni in nick_same_list[ind]:
				break
			if not nick_same_list[ind]:
				nick_same_list[ind].append(ni)
				break

	#print("*********************x**********************************")
	#print(nick_same_list)

	G = to_graph(nick_same_list)
	L = [list(c) for c in connected_components(G)]   #materialise the components as lists so they can be indexed and extended

	for i in range(1,len(L)+1):
		L[i-1] = [i]+L[i-1]

	#print(L)
	#Up to this point we have all the nicks of the same user clustered together.

	for folderiterator in range(startingMonth, endingMonth+1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the file for channel_name
			print(filePath)
		
			for line in content:
				flag_comma = 0
				if(line[0] != '=' and "] <" in line and "> " in line):
					m = re.search(r"\<(.*?)\>", line)
					var = m.group(0)[1:-1]
					var=correctLastCharCR(var)
					for d in range(len(nicks)):
						if var in L[d]:
							nick_sender = L[d][0]
							break
						
					for i in nicks:
						rec_list=[e.strip() for e in line.split(':')]
						rec_list[1]=rec_list[1][rec_list[1].find(">")+1:len(rec_list[1])]
						rec_list[1]=rec_list[1][1:]
						if not rec_list[1]:
							break
						for ik in xrange(0,len(rec_list)):
							if(rec_list[ik]):
								rec_list[ik]=correctLastCharCR(rec_list[ik])
						for z in rec_list:
							if(z==i):
								send_time.append(line[1:6])
								if(var != i):  
									for d in range(len(nicks)):
										if i in L[d]:
											nick_receiver=L[d][0]
											break
										
									for rt in xrange(0,5000):
										if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]):
											if (nick_sender == conversations[rt][1] and nick_receiver == conversations[rt][2]):
												conversations[rt][0]=conversations[rt][0]+1
												break
										if(len(conversations[rt])==1):
											conversations[rt].append(nick_sender)
											conversations[rt].append(nick_receiver)
											conversations[rt][0]=conversations[rt][0]+1
											break
							
						if "," in rec_list[1]: 
							flag_comma = 1
							rec_list_2=[e.strip() for e in rec_list[1].split(',')]
							for ij in xrange(0,len(rec_list_2)):
								if(rec_list_2[ij]):
									rec_list_2[ij]=correctLastCharCR(rec_list_2[ij])
							for j in rec_list_2:
								if(j==i):
									send_time.append(line[1:6])
									if(var != i):   
										for d in range(len(nicks)):
											if i in L[d]:
												nick_receiver=L[d][0]
												break
										
										for rt in xrange(0,5000):
											if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]):
												if (nick_sender == conversations[rt][1] and nick_receiver == conversations[rt][2]):
													conversations[rt][0]=conversations[rt][0]+1
													break
											if(len(conversations[rt])==1):
												conversations[rt].append(nick_sender)
												conversations[rt].append(nick_receiver)
												conversations[rt][0]=conversations[rt][0]+1
												break

						if(flag_comma == 0):
							rec=line[line.find(">")+1:line.find(", ")] 
							rec=rec[1:]
							rec=correctLastCharCR(rec)
							if(rec==i):
								send_time.append(line[1:6])
								if(var != i):
									for d in range(len(nicks)):
										if i in L[d]:
											nick_receiver=L[d][0]
											break
									
									for rt in xrange(0,5000):
										if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]): 
											if (nick_sender == conversations[rt][1] and nick_receiver == conversations[rt][2]):
												conversations[rt][0]=conversations[rt][0]+1
												break
										if(len(conversations[rt])==1):
											conversations[rt].append(nick_sender)
											conversations[rt].append(nick_receiver)
											conversations[rt][0]=conversations[rt][0]+1
											break
						
	G = Graph(directed=True)  #directed igraph graph; multiple directed edges between clients are kept
	#Notice how the syntax changes with python-igraphs as compared to networkx.

	vertex1=[]
	edge1=[]
	for fin in xrange(0,5000):
		if(len(conversations[fin])==3):
			if(str(conversations[fin][1]) not in vertex1):
				G.add_vertex(str(conversations[fin][1]))
				vertex1.append(str(conversations[fin][1]))
			if(str(conversations[fin][2]) not in vertex1):
				G.add_vertex(str(conversations[fin][2]))
				vertex1.append(str(conversations[fin][2]))  #vertex1 contains the vertex names.
			edge1.append(conversations[fin][0])          #edge1 contains the edge weights
			G.add_edge(str(conversations[fin][1]),str(conversations[fin][2]))  

	G.vs['name'] = vertex1
	G.es['label'] = edge1       #Here we add all the labels like color,name,id,weights etc that we want in our graph
	G.es['weight'] = edge1
	G.vs['id'] = G.vs['name']
	G.es['width'] = edge1

	#print(vertex1)
	#print(conversations)
	#G.write_adjacency("adja_wholeyear.csv",sep=',') #Igraphs has a simple function for printing the adjacency matrix of a graph to a csv file.
	
	G.write_pajek("checkpajek.net")  #writes a graph in pajek format.
	plot(G, "checkgraph.png",edge_width=rescale(edge1,out_range=(1, 15)),layout = G.layout_fruchterman_reingold(), edge_arrow_size=0.5, vertex_size=8)
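# --- Standalone sketch of the python-igraph calls used above. ---
# A minimal directed graph with one weighted edge, written out in Pajek format;
# vertex names and the file name are illustrative.
from igraph import Graph

g = Graph(directed=True)
g.add_vertex('userA')
g.add_vertex('userB')
g.add_edge('userA', 'userB')          # edges can be added by vertex name
g.es['weight'] = [5]                  # one weight per edge, as a list
g.write_pajek('sketch_pajek.net')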
Beispiel #45
0
def message_number_graph(log_dict, nicks, nick_same_list, DAY_BY_DAY_ANALYSIS=False):
    """ Creates a directed graph
        with each node representing an IRC user
        and each directed edge has a weight which 
        mentions the number messages sent and recieved by that user 
        in the selected time frame.
    Args:
        log_dict (dict): with key as dateTime.date object and value as {"data":datalist,"channel_name":channels name}
        nicks(list): list of all the nicks
        nick_same_list(list): list of lists mentioning nicks which belong to same users
    Returns:
       message_number_graph (nx graph object)
    """
    message_number_day_list = []
    conversations=[[0] for i in range(config.MAX_EXPECTED_DIFF_NICKS)]
    aggregate_message_number_graph = nx.DiGraph()  #directed graph with one weighted edge per user pair

    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))

    util.create_connected_nick_list(conn_comp_list)

    def msg_no_analysis_helper(rec_list, corrected_nick, nick, conn_comp_list,conversations,today_conversation):
        for receiver in rec_list:
            if(receiver == nick):
                if(corrected_nick != nick):                                 
                    nick_receiver = ''
                    nick_receiver = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, nick, conn_comp_list, nick_receiver)    

                    if DAY_BY_DAY_ANALYSIS:
                        today_conversation = util.extend_conversation_list(nick_sender, nick_receiver, today_conversation)
                    else:
                        conversations = util.extend_conversation_list(nick_sender, nick_receiver, conversations)

    def message_no_add_edge(message_graph, conversation):
        for index in xrange(config.MAX_EXPECTED_DIFF_NICKS):
            if(len(conversation[index]) == 3 and conversation[index][0] >= config.THRESHOLD_MESSAGE_NUMBER_GRAPH):
                if len(conversation[index][1]) >= config.MINIMUM_NICK_LENGTH and len(conversation[index][2]) >= config.MINIMUM_NICK_LENGTH:
                    message_graph.add_edge(conversation[index][1], conversation[index][2], weight=conversation[index][0])
        return message_graph


    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            today_conversation = [[0] for i in range(config.MAX_EXPECTED_DIFF_NICKS)]
            for line in day_log:
                flag_comma = 0

                if(util.check_if_msg_line (line)):
                    parsed_nick = re.search(r"\<(.*?)\>", line)
                    corrected_nick = util.correctLastCharCR(parsed_nick.group(0)[1:-1])
                    nick_sender = ""
                    nick_receiver = ""                    
                    nick_sender = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, corrected_nick, conn_comp_list, nick_sender)        

                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:
                            break                        
                        rec_list = util.correct_last_char_list(rec_list)       
                        msg_no_analysis_helper(rec_list, corrected_nick, nick, conn_comp_list, conversations,today_conversation)

                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2=[e.strip() for e in rec_list[1].split(',')]
                            for i in xrange(0,len(rec_list_2)):
                                if(rec_list_2[i]):
                                    rec_list_2[i] = util.correctLastCharCR(rec_list_2[i])                            
                            msg_no_analysis_helper(rec_list_2, corrected_nick, nick, conn_comp_list, conversations, today_conversation)                

                        if(flag_comma == 0):
                            rec = line[line.find(">")+1:line.find(", ")]
                            rec = rec[1:]
                            rec = util.correctLastCharCR(rec)
                            if(rec == nick):
                                if(corrected_nick != nick):                                   
                                    nick_receiver = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, nick, conn_comp_list, nick_receiver)        

            if DAY_BY_DAY_ANALYSIS:
                today_message_number_graph = nx.DiGraph()
                today_message_number_graph = message_no_add_edge(today_message_number_graph, today_conversation)
                year, month, day = util.get_year_month_day(day_content)
                message_number_day_list.append([today_message_number_graph, year+'-'+month+'-'+day])

    print "\nBuilding graph object with EDGE WEIGHT THRESHOLD:", config.THRESHOLD_MESSAGE_NUMBER_GRAPH

    if not DAY_BY_DAY_ANALYSIS:
        aggregate_message_number_graph = message_no_add_edge(aggregate_message_number_graph, conversations)
        

    if config.DEBUGGER:
        print "========> 30 on " + str(len(conversations)) + " conversations"
        print conversations[:30]

    if DAY_BY_DAY_ANALYSIS:
        return message_number_day_list
    else:
        return aggregate_message_number_graph
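# --- Standalone sketch of the conversation bookkeeping used above. ---
# Each conversations[] entry is [message_count, sender, receiver]; entries at or
# above a threshold become weighted edges. Names and threshold are illustrative.
import networkx as nx

conversations = [[12, 'userA', 'userB'], [1, 'userB', 'userC']]
THRESHOLD = 2

g = nx.DiGraph()
for entry in conversations:
    if len(entry) == 3 and entry[0] >= THRESHOLD:
        g.add_edge(entry[1], entry[2], weight=entry[0])

print(g.edges(data=True))   # only the userA -> userB edge survives the threshold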
def findResponseTime(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
	nick_same_list=[[] for i in range(7000)]
	nicks = [] #list of all the nicknames
	conv = []
	conv_diff = []

	# out_dir_msg_num = output_directory+"RT/"
	out_dir_msg_num = output_directory
	if not os.path.exists(os.path.dirname(out_dir_msg_num)):
		try:
			os.makedirs(os.path.dirname(out_dir_msg_num))
		except OSError as exc: # Guard against race condition
			if exc.errno != errno.EEXIST:
				raise


	for folderiterator in range(startingMonth, endingMonth + 1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the file for channel_name
					
			send_time = [] #list of all the times a user sends a message to another user
			nicks_for_the_day = []
			print(filePath)   
		
			#code for getting all the nicknames in a list
			for i in content:
				if(i[0] != '=' and "] <" in i and "> " in i):
					m = re.search(r"\<(.*?)\>", i)
					if m.group(0) not in nicks_for_the_day:                       
						nicks_for_the_day.append(m.group(0))   #used regex to get the string between <> and appended it to the nicks list

			for i in xrange(0,len(nicks_for_the_day)):
				if nicks_for_the_day[i][1:-1] not in nicks:
					nicks.append(nicks_for_the_day[i][1:-1])     #removed <> from the nicknames
				
			for i in xrange(0,len(nicks)):
				if(len(nicks[i])!=0):
						nicks[i]=correctLastCharCR(nicks[i])

			for j in content:
				if(j[0]=='=' and "changed the topic of" not in j):
					line1=j[j.find("=")+1:j.find(" is")]
					line2=j[j.find("wn as")+1:j.find("\n")]
					line1=line1[3:]
					line2=line2[5:]
					if(len(line1)!=0):
						line1=correctLastCharCR(line1)
						
					if(len(line2)!=0):
						line2=correctLastCharCR(line2)
						
					if line1 not in nicks:
						nicks.append(line1)
					if line2 not in nicks:
						nicks.append(line2)
		
			#code for forming a list of lists to avoid nickname duplication
			for line in content:
				if(line[0]=='=' and "changed the topic of" not in line):
					line1=line[line.find("=")+1:line.find(" is")]
					line2=line[line.find("wn as")+1:line.find("\n")]
					line1=line1[3:]
					line2=line2[5:]
					if(len(line1)!=0):
						line1=correctLastCharCR(line1)
						
					if(len(line2)!=0):
						line2=correctLastCharCR(line2)
						
					for i in range(7000):
						if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
							if line1 in nick_same_list[i] and line2 not in nick_same_list[i]:
								nick_same_list[i].append(line2)
								break
							if line2 in nick_same_list[i] and line1 not in nick_same_list[i]: 
								nick_same_list[i].append(line1)
								break
							if line2 in nick_same_list[i] and line1 in nick_same_list[i]:
								break  
						if not nick_same_list[i]:
							nick_same_list[i].append(line1)
							nick_same_list[i].append(line2)
							break

	for ni in nicks:
		for ind in range(7000):
			if ni in nick_same_list[ind]:
				break
			if not nick_same_list[ind]:
				nick_same_list[ind].append(ni)
				break

	G = to_graph(nick_same_list)
	L = [list(c) for c in connected_components(G)]   #materialise the components as lists so they can be indexed and extended

	for i in range(1,len(L)+1):
		L[i-1] = [i]+L[i-1]

	graph_to_sir = []
	graph_x_axis = []
	graph_y_axis = []
	graphx1 =[]
	graphy1 =[]
	graphx2 =[]
	graphy2 =[]
	dateadd=-1
	
	for folderiterator in range(startingMonth, endingMonth + 1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the file for channel_name
			dateadd=dateadd+1
			send_time = [] #list of all the times a user sends a message to another user
			meanstd_list = []
			totalmeanstd_list = []
			x_axis = []
			y_axis = []
			real_y_axis = []
			time_in_min = [[] for i in range(1000)]
			
			print(filePath)

			conversations=[[] for i in range(200)]   
			#code for making relation map between clients		
			for line in content:
				flag_comma = 0
				if(line[0] != '=' and "] <" in line and "> " in line):
					m = re.search(r"\<(.*?)\>", line)
					var = m.group(0)[1:-1]
					var=correctLastCharCR(var)
					for d in range(len(nicks)):
						if((d < len(L)) and (var in L[d])):
							nick_sender = L[d][0]
							break
			
					for i in nicks:
						rec_list=[e.strip() for e in line.split(':')]
						rec_list[1]=rec_list[1][rec_list[1].find(">")+1:len(rec_list[1])]
						rec_list[1]=rec_list[1][1:]
						if not rec_list[1]:
							break
						for ik in xrange(0,len(rec_list)):
							if(rec_list[ik]):
								rec_list[ik]=correctLastCharCR(rec_list[ik])
						for z in rec_list:
							if(z==i):
								send_time.append(line[1:6])
								if(var != i):  
									for d in range(len(nicks)):
										if((d<len(L)) and (i in L[d])):
											nick_receiver=L[d][0]
											break
										
									for rt in xrange(0,200):
										if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]):
											conversations[rt].append(line[1:6])
											break
										if(len(conversations[rt])==0):
											conversations[rt].append(nick_sender)
											conversations[rt].append(nick_receiver)
											conversations[rt].append(line[1:6])
											break
							
						if "," in rec_list[1]: 
							flag_comma = 1
							rec_list_2=[e.strip() for e in rec_list[1].split(',')]
							for ij in xrange(0,len(rec_list_2)):
								if(rec_list_2[ij]):
									rec_list_2[ij]=correctLastCharCR(rec_list_2[ij])
									
							for j in rec_list_2:
								if(j==i):
									send_time.append(line[1:6])
									if(var != i):   
										for d in range(len(nicks)):
											if((d<len(L)) and (i in L[d])):
												nick_receiver=L[d][0]
												break
										
										for rt in xrange(0,200):
											if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]):
												conversations[rt].append(line[1:6]) 
												break
											if(len(conversations[rt])==0):
												conversations[rt].append(nick_sender)
												conversations[rt].append(nick_receiver)
												conversations[rt].append(line[1:6])
												break

						if(flag_comma == 0):
							rec=line[line.find(">")+1:line.find(", ")] 
							rec=rec[1:]
							rec=correctLastCharCR(rec)
							if(rec==i):
								send_time.append(line[1:6])
								if(var != i):
									for d in range(len(nicks)):
										if ((d<len(L)) and (i in L[d])):
											nick_receiver=L[d][0]
											break
									
									for rt in xrange(0,200):
										if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]): 
											conversations[rt].append(line[1:6])
											break
										if(len(conversations[rt])==0):
											conversations[rt].append(nick_sender)
											conversations[rt].append(nick_receiver)
											conversations[rt].append(line[1:6])
											break
			
			for index in range(0,200):
				if(len(conversations[index])!=0):  
					for index1 in range(2,len(conversations[index])-1):
						conversations[index][index1]=(int(conversations[index][index1+1][0:2])*60+int(conversations[index][index1+1][3:5])) - (int(conversations[index][index1][0:2])*60+int(conversations[index][index1][3:5]))
	
			for index in range(0,200):
				if(len(conversations[index])!=0): 
					if(len(conversations[index])==3):
						conversations[index][2] = int(conversations[index][2][0:2])*60+int(conversations[index][2][3:5])     
					else: 
						del conversations[index][-1]

		#Explanation provided in parser-CL+CRT.py
			for index in range(0,200):
				if(len(conversations[index])!=0):
					for index1 in range(2,len(conversations[index])):
						totalmeanstd_list.append(conversations[index][index1])

			if(len(totalmeanstd_list)!=0):
				for iy in range(0, max(totalmeanstd_list)+1):
					x_axis.append(iy)

				for ui in x_axis:
					y_axis.append(float(totalmeanstd_list.count(ui))/float(len(totalmeanstd_list)))
				
				#probability of each RT = number of occurrences / total occurrences; see the standalone sketch after this function
				real_y_axis.append(y_axis[0])
				for ix in range(1, len(y_axis)):
					real_y_axis.append(float(real_y_axis[ix-1])+float(y_axis[ix]))
			
			#cumulative distribution: keep adding the current probability to the running total; the last entry reaches 1
			for hi in range(0,len(totalmeanstd_list)):
				graph_to_sir.append(totalmeanstd_list[hi])

			totalmeanstd_list.append(numpy.mean(totalmeanstd_list))
			totalmeanstd_list.append(numpy.mean(totalmeanstd_list)+2*numpy.std(totalmeanstd_list))
		
			for index in range(0,200):
				if(len(conversations[index])!=0):
					for index1 in range(2,len(conversations[index])):
						meanstd_list.append(conversations[index][index1])
					conversations[index].append(numpy.mean(meanstd_list))
					conversations[index].append(numpy.mean(meanstd_list)+(2*numpy.std(meanstd_list)))
					meanstd_list[:] = []

		#print("Conversation RT Info")
		#print(conversations)
	
		#print("Total Response-Time")
		#print(totalmeanstd_list)
		#print("\n\n")
		
#print("grpahs to graph_to_sir")  
#print(graph_to_sir)

	graph_to_sir.sort()
#print(graph_to_sir)

	if graph_to_sir:   #guard: indexing the last element fails on an empty list
		for ti in range(0,graph_to_sir[-1]+1):
			graph_y_axis.append(graph_to_sir.count(ti))
			graph_x_axis.append(ti)

	# print(graph_y_axis)
#print(graph_x_axis)
#print(len(graph_y_axis))
#print(len(graph_x_axis))

#Finally storing the RT values along with their frequencies in a csv file. 
	rows = zip(graph_x_axis,graph_y_axis)
	filename=out_dir_msg_num+channel_name+"_"+str(startingMonth)+"-"+str(startingDate)+"_"+str(endingMonth)+"-"+str(endingDate)+"_RT.csv"
	with open(filename, 'a+') as myfile:
		wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
		for row in rows:
			wr.writerow(row)
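# --- Standalone sketch of the PMF/CDF construction referenced above. ---
# Response times in minutes are illustrative; the probability of each value is
# its count over the total, and the CDF is the running sum of those probabilities.
response_times = [1, 1, 2, 4, 4, 4, 9]
total = float(len(response_times))

x_axis = range(max(response_times) + 1)
y_axis = [response_times.count(v) / total for v in x_axis]   # PMF

real_y_axis = []
for p in y_axis:
	real_y_axis.append(p if not real_y_axis else real_y_axis[-1] + p)

print(real_y_axis[-1])   # -> 1.0 once every response time is accumulated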
def createAggregateGraph(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):

	MAX_EXPECTED_DIFF_NICKS = 5000

	nick_same_list=[[] for i in range(MAX_EXPECTED_DIFF_NICKS)]  

	conversations=[[] for i in range(MAX_EXPECTED_DIFF_NICKS)]   
	for i in xrange(0,MAX_EXPECTED_DIFF_NICKS):
		conversations[i].append(0)

	nicks = [] #list of all the nicknames
	aggregate_graph = nx.DiGraph()  #directed graph with one weighted edge per user pair

	if not os.path.exists(os.path.dirname(output_directory)):
		try:
			os.makedirs(os.path.dirname(output_directory))
		except OSError as exc: # Guard against race condition
			if exc.errno != errno.EEXIST:
				raise
				
	for folderiterator in range(startingMonth, endingMonth+1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the file for channel_name
		
			nicks_for_the_day = []
			print "Working on " + filePath 
			
			'''Getting all the nicknames in a list'''
			for i in content:
				if(i[0] != '=' and "] <" in i and "> " in i):
					m = re.search(r"\<(.*?)\>", i)
					if m.group(0) not in nicks_for_the_day:                       
						nicks_for_the_day.append(m.group(0))   #used regex to get the string between <> and appended it to the nicks list

			for i in xrange(0,len(nicks_for_the_day)):
				if nicks_for_the_day[i][1:-1] not in nicks:
					nicks.append(nicks_for_the_day[i][1:-1])     #removed <> from the nicknames
					
			for i in xrange(0,len(nicks)):
				nicks[i] = correctLastCharCR(nicks[i])
			
			for line in content:
				if(line[0]=='=' and "changed the topic of" not in line):
					nick1=line[line.find("=")+1:line.find(" is")]
					nick2=line[line.find("wn as")+1:line.find("\n")]
					nick1=nick1[3:]
					nick2=nick2[5:]
					nick1=correctLastCharCR(nick1)
					nick2=correctLastCharCR(nick2)
					if nick1 not in nicks:
						nicks.append(nick1)
					if nick2 not in nicks:
						nicks.append(nick2)
				
			
			for line in content:
				if(line[0]=='=' and "changed the topic of" not in line):
					line1=line[line.find("=")+1:line.find(" is")]
					line2=line[line.find("wn as")+1:line.find("\n")]
					line1=line1[3:]
					line2=line2[5:]
					line1=correctLastCharCR(line1)
					line2=correctLastCharCR(line2)
					for i in range(MAX_EXPECTED_DIFF_NICKS):
						if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
							if line1 in nick_same_list[i] and line2 not in nick_same_list[i]:
								nick_same_list[i].append(line2)
								break
							if line2 in nick_same_list[i] and line1 not in nick_same_list[i]: 
								nick_same_list[i].append(line1)
								break
							if line2 in nick_same_list[i] and line1 in nick_same_list[i]:
								break  
						if not nick_same_list[i]:
							nick_same_list[i].append(line1)
							nick_same_list[i].append(line2)
							break

	for ni in nicks:
		for ind in range(MAX_EXPECTED_DIFF_NICKS):
			if ni in nick_same_list[ind]:
				break
			if not nick_same_list[ind]:
				nick_same_list[ind].append(ni)
				break

	G = to_graph(nick_same_list)
	L = [list(c) for c in connected_components(G)]   #materialise the components as lists so they can be indexed and extended

	for i in range(1,len(L)+1):
		L[i-1] = [str(i)]+L[i-1]

	for folderiterator in range(startingMonth, endingMonth+1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the file for channel_name

			print(filePath)
			for line in content:
				flag_comma = 0
				if(line[0] != '=' and "] <" in line and "> " in line):
					m=re.search(r"\<(.*?)\>", line)
					var=m.group(0)[1:-1]
					var=correctLastCharCR(var)
					for d in range(MAX_EXPECTED_DIFF_NICKS):
						if ((d < len(L)) and (var in L[d])):  #change nick_same_list to L because L is the main list of all users and nicks now
							nick_sender = L[d][0]
							break
						
					for i in nicks:
						rec_list=[e.strip() for e in line.split(':')]
						rec_list[1]=rec_list[1][rec_list[1].find(">")+1:len(rec_list[1])]
						rec_list[1]=rec_list[1][1:]
						if not rec_list[1]:
							break
						for k in xrange(0,len(rec_list)):
							if(rec_list[k]):
								rec_list[k]=correctLastCharCR(rec_list[k])
						for z in rec_list:
							if(z==i):
								if(var != i):  
									for d in range(MAX_EXPECTED_DIFF_NICKS):
										if ((d<len(L)) and (i in L[d])):
											nick_receiver=L[d][0]
											break
								
									for r in xrange(0,MAX_EXPECTED_DIFF_NICKS):
										if (nick_sender in conversations[r] and nick_receiver in conversations[r]):
											if (nick_sender == conversations[r][1] and nick_receiver == conversations[r][2]):
												conversations[r][0]=conversations[r][0]+1
												break
										if(len(conversations[r])==1):
											conversations[r].append(nick_sender)
											conversations[r].append(nick_receiver)
											conversations[r][0]=conversations[r][0]+1
											break
								
						if "," in rec_list[1]: 
							flag_comma = 1
							rec_list_2=[e.strip() for e in rec_list[1].split(',')]
							for ij in xrange(0,len(rec_list_2)):                       #use ij instead of i: i is already bound by the enclosing loop and would be clobbered here
								if(rec_list_2[ij]):
									rec_list_2[ij] = correctLastCharCR(rec_list_2[ij])
							for j in rec_list_2:
								if(j==i):
									if(var != i):  
										for d in range(MAX_EXPECTED_DIFF_NICKS):
											if i in L[d]:
												nick_receiver=L[d][0]
												break
												
										for r in xrange(0,MAX_EXPECTED_DIFF_NICKS):
											if (nick_sender in conversations[r] and nick_receiver in conversations[r]):
												if (nick_sender == conversations[r][1] and nick_receiver == conversations[r][2]):
													conversations[r][0]=conversations[r][0]+1
													break
											if(len(conversations[r])==1):
												conversations[r].append(nick_sender)
												conversations[r].append(nick_receiver)
												conversations[r][0]=conversations[r][0]+1
												break

						if(flag_comma == 0):
							rec=line[line.find(">")+1:line.find(", ")]
							rec=rec[1:]
							rec = correctLastCharCR(rec) 
							if(rec==i):
								if(var != i):
									for d in range(MAX_EXPECTED_DIFF_NICKS):
										if i in L[d]:
											nick_receiver=L[d][0]
											break
											
									for r in xrange(0,MAX_EXPECTED_DIFF_NICKS):
										if (nick_sender in conversations[r] and nick_receiver in conversations[r]): 
											if (nick_sender == conversations[r][1] and nick_receiver == conversations[r][2]):
												conversations[r][0]=conversations[r][0]+1
												break
										if(len(conversations[r])==1):
											conversations[r].append(nick_sender)
											conversations[r].append(nick_receiver)
											conversations[r][0]=conversations[r][0]+1
											break

			for index in xrange(0,MAX_EXPECTED_DIFF_NICKS):
				if(len(conversations[index])==3):
					aggregate_graph.add_edge(conversations[index][1],conversations[index][2],weight=conversations[index][0])  

	# print("========> nicks")
	# print(nicks)
	# print("========> nick_same_list")
	# print(nick_same_list)
	# print("========> conversations")
	# print(conversations)
	
	for u,v,d in aggregate_graph.edges(data=True):
		d['label'] = d.get('weight','')

	output_file=output_directory+channel_name+"_2013_"+str(startingMonth)+"_"+str(endingMonth)+"_aggregategraph.png"
	print "Generating "+output_file
	print "Please wait ...."

	A = nx.to_agraph(aggregate_graph)
	A.layout(prog='dot')
	A.draw(output_file)
	print("Done Generating")
def createMessageTimeGraph(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
	# out_dir_msg_time = output_directory+"message-time/"
	out_dir_msg_time = output_directory
	if not os.path.exists(os.path.dirname(out_dir_msg_time)):
		try:
			os.makedirs(os.path.dirname(out_dir_msg_time))
		except OSError as exc: # Guard against race condition
			if exc.errno != errno.EEXIST:
				raise

	rem_time= None #remembers the time of the last message of the file parsed before the current file
	nick_same_list=[[] for i in range(5000)]  #list of lists; each inner list collects all nicks of one user
	nicks = [] #list of all the nicknames     

	for folderiterator in range(startingMonth, endingMonth+1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the file for channel_name
			
			nicks_for_the_day = []  
			'''
				Getting all the nicknames in a list nicks[]
			'''
			for i in content:
				if(i[0] != '=' and "] <" in i and "> " in i):
					m = re.search(r"\<(.*?)\>", i)
					if m.group(0) not in nicks_for_the_day:                       
						nicks_for_the_day.append(m.group(0))   #used regex to get the string between <> and appended it to the nicks list

			for i in xrange(0,len(nicks_for_the_day)):
				if nicks_for_the_day[i][1:-1] not in nicks:
					nicks.append(nicks_for_the_day[i][1:-1])     #removed <> from the nicknames
					
			for i in xrange(0,len(nicks)):
				nicks[i] = correctLastCharCR(nicks[i])
			
			for line in content:
				if(line[0]=='=' and "changed the topic of" not in line):
					nick1=line[line.find("=")+1:line.find(" is")]
					nick2=line[line.find("wn as")+1:line.find("\n")]
					nick1=nick1[3:]
					nick2=nick2[5:]
					nick1=correctLastCharCR(nick1)
					nick2=correctLastCharCR(nick2)
					if nick1 not in nicks:
						nicks.append(nick1)
					if nick2 not in nicks:
						nicks.append(nick2)

			for line in content:
				if(line[0]=='=' and "changed the topic of" not in line):
					line1=line[line.find("=")+1:line.find(" is")]
					line2=line[line.find("wn as")+1:line.find("\n")]
					line1=line1[3:]
					line2=line2[5:]
					line1=correctLastCharCR(line1)
					line2=correctLastCharCR(line2)
					for i in range(5000):
						if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
							if line1 in nick_same_list[i] and line2 not in nick_same_list[i]:
								nick_same_list[i].append(line2)
								break
							if line2 in nick_same_list[i] and line1 not in nick_same_list[i]: 
								nick_same_list[i].append(line1)
								break
							if line2 in nick_same_list[i] and line1 in nick_same_list[i]:
								break  
						if not nick_same_list[i]:
							nick_same_list[i].append(line1)
							nick_same_list[i].append(line2)
							break

	for ni in nicks:
		for ind in range(5000):
			if ni in nick_same_list[ind]:
				break
			if not nick_same_list[ind]:
				nick_same_list[ind].append(ni)
				break

	G = to_graph(nick_same_list)
	L = [list(c) for c in connected_components(G)]   #materialise the components as lists so they can be indexed and extended

	for i in range(1,len(L)+1):
		L[i-1] = [str(i)]+L[i-1]

	for folderiterator in range(startingMonth, endingMonth+1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the file for channel_name

			'''=========================== Plotting the conversation graph =========================== '''
			graph_conversation = nx.MultiDiGraph()  #graph that keeps multiple directed edges between clients, one per message
			for line in content:
				flag_comma = 0
				if(line[0] != '=' and "] <" in line and "> " in line):
					m = re.search(r"\<(.*?)\>", line)
					var = m.group(0)[1:-1]
					var = correctLastCharCR(var)
					for d in range(5000):
						if ((d < len(L)) and (var in L[d])): 
							nick_sender = L[d][0]
							break

					for i in nicks:
						rec_list=[e.strip() for e in line.split(':')] #receiver list, split on ':'
						rec_list[1]=rec_list[1][rec_list[1].find(">")+1:len(rec_list[1])]
						rec_list[1]=rec_list[1][1:]
						if not rec_list[1]: #index 0 will contain time 14:02
							break
						for k in xrange(0,len(rec_list)):
							if(rec_list[k]): #skip empty strings
								rec_list[k] = correctLastCharCR(rec_list[k])
						for z in rec_list:
							if(z==i):
								if(var != i):  
									for d in range(5000):
										if ((d < len(L)) and (i in L[d])): 
											nick_receiver=L[d][0]
											break
									graph_conversation.add_edge(nick_sender,nick_receiver,weight=line[1:6])  
								
						if "," in rec_list[1]: #receiver list may of the form <Dhruv> Rohan, Ram :
							flag_comma = 1
							rec_list_2=[e.strip() for e in rec_list[1].split(',')]
							for y in xrange(0,len(rec_list_2)):
								if(rec_list_2[y]): #skip empty strings
									rec_list_2[y]=correctLastCharCR(rec_list_2[y])
							for j in rec_list_2:
								if(j==i):
									if(var != i):   
										for d in range(5000):
											if i in L[d]:
												nick_receiver=L[d][0]
												break
										graph_conversation.add_edge(nick_sender,nick_receiver,weight=line[1:6])   

						if(flag_comma == 0): #receiver list can be <Dhruv> Rohan, Hi!
							rec=line[line.find(">")+1:line.find(", ")] 
							rec=rec[1:]
							rec=correctLastCharCR(rec)
							if(rec==i):
								if(var != i):
									for d in range(5000):
										if i in L[d]:
											nick_receiver=L[d][0]
											break
									graph_conversation.add_edge(nick_sender,nick_receiver,weight=line[1:6])  
							
			for u,v,d in graph_conversation.edges(data=True):
				d['label'] = d.get('weight','')
			output_file=out_dir_msg_time+channel_name+"_2013_"+str(folderiterator)+"_"+str(fileiterator)+"_msg_time.png"
			print "Generated " + output_file
			A = nx.to_agraph(graph_conversation)
			A.layout(prog='dot')
			A.draw(output_file)
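# --- Standalone sketch of the MultiDiGraph used above. ---
# Unlike DiGraph, a MultiDiGraph keeps parallel edges, so every message between
# the same pair of users becomes its own edge, labelled with its timestamp.
import networkx as nx

g = nx.MultiDiGraph()
g.add_edge('userA', 'userB', weight='14:02')
g.add_edge('userA', 'userB', weight='14:05')    # second message, second edge
print(g.number_of_edges('userA', 'userB'))       # -> 2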
                gf_idx[t].append(newfamily)
            else:
                gf_idx[t] = [newfamily]
    verbalise("Y", "Results after merging based on sequence identity:")
    report(gene_families, 5, verbalise=verbalise)

    #############################################################################
    # merge gene families based on Trinity gene groups (solution from SO):
    G = to_graph(gene_families)

    # collect new gene families:
    trinity_pool = {}
    gf_idx = {} # reset the gene family index to the new groups
    count = 0

    for group in connected_components(G):
        count += 1
        ts = []
        for gid in group:
            ts += geneid_idx[gid]
        newgf = Gene_family(ts)

        # set index to find gf for given transcript:
        for t in newgf:
            gf_idx[t.td_id] = newgf

        trinity_pool[newgf] = True

    # report results:
    verbalise("Y", "\n\nResults after merging based on Trinity gene assignment:")
    report(trinity_pool, 5, verbalise=verbalise)
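# --- Standalone sketch of the transcript -> family index rebuilt above. ---
# Gene_family and geneid_idx come from the surrounding (truncated) example;
# plain lists and dicts stand in for them here as assumptions.
geneid_idx = {'g1': ['t1', 't2'], 'g2': ['t3']}   # gene id -> its transcripts
group = ['g1', 'g2']                              # one connected component

ts = []
for gid in group:
    ts += geneid_idx[gid]
newgf = ts                                        # stand-in for Gene_family(ts)

gf_idx = dict((t, newgf) for t in newgf)          # transcript -> merged family
print(gf_idx['t3'] is gf_idx['t1'])               # -> True: same family object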
def createMessageNumberGraph(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
	nick_same_list=[[] for i in range(5000)] #list of lists, each holding all the nicks of one particular person
	nicks = [] #list of all the nicknames

	# out_dir_msg_num = output_directory+"number-of-messages/"
	out_dir_msg_num = output_directory
	if not os.path.exists(os.path.dirname(out_dir_msg_num)):
		try:
			os.makedirs(os.path.dirname(out_dir_msg_num))
		except OSError as exc: # Guard against race condition
			if exc.errno != errno.EEXIST:
				raise

	for folderiterator in range(startingMonth, endingMonth+1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the file for channel_name
		
			nicks_for_the_day = []
			print "Working on " + filePath 
			
			'''Getting all the nicknames in a list'''
			for i in content:
				if(i[0] != '=' and "] <" in i and "> " in i):
					m = re.search(r"\<(.*?)\>", i)
					if m.group(0) not in nicks_for_the_day:                       
						nicks_for_the_day.append(m.group(0))   #used regex to get the string between <> and appended it to the nicks list

			for i in xrange(0,len(nicks_for_the_day)):
				if nicks_for_the_day[i][1:-1] not in nicks:
					nicks.append(nicks_for_the_day[i][1:-1])     #removed <> from the nicknames
					
			for i in xrange(0,len(nicks)):
				nicks[i] = correctLastCharCR(nicks[i])
			
			for line in content:
				if(line[0]=='=' and "changed the topic of" not in line):
					nick1=line[line.find("=")+1:line.find(" is")]
					nick2=line[line.find("wn as")+1:line.find("\n")]
					nick1=nick1[3:]
					nick2=nick2[5:]
					nick1=correctLastCharCR(nick1)
					nick2=correctLastCharCR(nick2)
					if nick1 not in nicks:
						nicks.append(nick1)
					if nick2 not in nicks:
						nicks.append(nick2)
				
			for line in content:
				if(line[0]=='=' and "changed the topic of" not in line):
					line1=line[line.find("=")+1:line.find(" is")]
					line2=line[line.find("wn as")+1:line.find("\n")]
					line1=line1[3:]
					line2=line2[5:]
					line1=correctLastCharCR(line1)
					line2=correctLastCharCR(line2)
					for i in range(5000):
						if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
							if line1 in nick_same_list[i] and line2 not in nick_same_list[i]:
								nick_same_list[i].append(line2)
								break
							if line2 in nick_same_list[i] and line1 not in nick_same_list[i]: 
								nick_same_list[i].append(line1)
								break
							if line2 in nick_same_list[i] and line1 in nick_same_list[i]:
								break  
						if not nick_same_list[i]:
							nick_same_list[i].append(line1)
							nick_same_list[i].append(line2)
							break

	for ni in nicks:
		for ind in range(5000):
			if ni in nick_same_list[ind]:
				break
			if not nick_same_list[ind]:
				nick_same_list[ind].append(ni)
				break

	G = to_graph(nick_same_list)
	L = [list(c) for c in connected_components(G)]   #materialise the components as lists so they can be indexed and extended

	for i in range(1,len(L)+1):
		L[i-1] = [str(i)]+L[i-1]

	for folderiterator in range(startingMonth, endingMonth+1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the file for channel_name

			print(filePath) 
			conversations=[[] for i in range(100)]   #per-day conversation table; each used entry becomes [count, sender, receiver]
			for i in xrange(0,100):
				conversations[i].append(0)
			
			for line in content:
				flag_comma = 0
				if(line[0] != '=' and "] <" in line and "> " in line):
					m = re.search(r"\<(.*?)\>", line)
					var = m.group(0)[1:-1]
					var = correctLastCharCR(var) 
					for d in range(len(nicks)):
						if((d < len(L)) and (var in L[d])):
							nick_sender = L[d][0]
							break
							
					for i in nicks:
						rec_list=[e.strip() for e in line.split(':')]
						rec_list[1]=rec_list[1][rec_list[1].find(">")+1:len(rec_list[1])]
						rec_list[1]=rec_list[1][1:]
						if not rec_list[1]:
							break
						for x in xrange(0,len(rec_list)):
							if(rec_list[x]):
								rec_list[x] = correctLastCharCR(rec_list[x])
						for z in rec_list:
							if(z==i):
								if(var != i):  
									for d in range(len(nicks)):
										if((d<len(L)) and (i in L[d])):
											nick_receiver=L[d][0]
											break

									for k in xrange(0,100):
										if (nick_sender in conversations[k] and nick_receiver in conversations[k]):
											if (nick_sender == conversations[k][1] and nick_receiver == conversations[k][2]):
												conversations[k][0]=conversations[k][0]+1
												break
										if(len(conversations[k])==1):
											conversations[k].append(nick_sender)
											conversations[k].append(nick_receiver)
											conversations[k][0]=conversations[k][0]+1
											break
								
						if "," in rec_list[1]: 
							flag_comma = 1
							rec_list_2=[e.strip() for e in rec_list[1].split(',')]
							for y in xrange(0,len(rec_list_2)):
								if(rec_list_2[y]):
									rec_list_2[y] = correctLastCharCR(rec_list_2[y])
							for j in rec_list_2:
								if(j==i):
									if(var != i):   
										for d in range(len(nicks)):
											if i in L[d]:
												nick_receiver=L[d][0]
												break
												
										for k in xrange(0,100):
											if (nick_sender in conversations[k] and nick_receiver in conversations[k]):
												if (nick_sender == conversations[k][1] and nick_receiver == conversations[k][2]):
													conversations[k][0]=conversations[k][0]+1
													break
											if(len(conversations[k])==1):
												conversations[k].append(nick_sender)
												conversations[k].append(nick_receiver)
												conversations[k][0]=conversations[k][0]+1
												break

						if(flag_comma == 0):
							rec=line[line.find(">")+1:line.find(", ")][1:]
							rec = correctLastCharCR(rec)
							if(rec==i):
								if(var != i):
									for d in range(len(nicks)):
										if i in L[d]:
											nick_receiver=L[d][0]
											break
										
									for k in xrange(0,100):
										if (nick_sender in conversations[k] and nick_receiver in conversations[k]):  
											if (nick_sender == conversations[k][1] and nick_receiver == conversations[k][2]):
												conversations[k][0]=conversations[k][0]+1
												break
										if(len(conversations[k])==1):
											conversations[k].append(nick_sender)
											conversations[k].append(nick_receiver)
											conversations[k][0]=conversations[k][0]+1
											break
		
			msg_num_graph = nx.DiGraph()  #directed graph; each edge weight counts messages from sender to receiver

			for y in xrange(0,100):
				if(len(conversations[y])==3):
					msg_num_graph.add_edge(conversations[y][1],conversations[y][2],weight=conversations[y][0])   

			for u,v,d in msg_num_graph.edges(data=True):
				d['label'] = d.get('weight','')
			output_file=out_dir_msg_num+channel_name+"_2013_"+str(folderiterator)+"_"+str(fileiterator)+"_msg_num.png"
			print "Generated " + output_file
			A = nx.drawing.nx_agraph.to_agraph(msg_num_graph)
			A.layout(prog='dot')
			A.draw(output_file)
Beispiel #51
0
	def gcc_size(self):
		"""Return the size of the largest (giant) connected component of the network."""
		cc_sizes = [len(cc) for cc in connected_components(self._network)]
		return max(cc_sizes)
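#A minimal usage sketch (illustrative only) for gcc_size on a throwaway graph; the real
#graph lives in self._network of the surrounding class:
#	g = networkx.Graph([(1, 2), (2, 3), (5, 6)])
#	max(len(cc) for cc in connected_components(g))  # -> 3, the giant component {1, 2, 3}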
def createGephiTimelapseCSV(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
	"""[Deprecated]
	Produces the node and edge CSV files needed to build a timelapse of user interactions in Gephi. Each row carries the node/edge appear and disappear times, so the files can be imported into Gephi directly.
	"""
	today1= []
	today2 =[]
	col1 = []
	col2 = []
	col3 = []
	col4 = []
	col_edge1 = []
	col_edge2 = []
	col_edge3 = []
	col_edge4 = []
	col_edge5 = []
	col_edge6 = []

	for i in range(0,365):
		dt1= datetime.datetime.now() - datetime.timedelta(days=3*365 + 105 - i)

		#Append all 365 dates to today1 and today2. The offset depends on the day the code was written: 105 means
		#it was written 105 days after 1 Jan 2016. For the Gephi timelapse to work, the times must be in dd-mm-yy h:m:s format.
		#It helps to experiment with Gephi first and then come back to this code.
		today1.append(dt1.strftime('%Y-%m-%d'))
		dt2= datetime.datetime.now() - datetime.timedelta(days=3*365 + 105 - i) 
	
		today2.append(dt2.strftime('%Y-%m-%d'))

	#for xv in range(len(today1)):        #If we want the graphs to simply change on a daily basis we can append 00:00:01 to the appear time. Then, irrespective
	#today1[xv]= today1[xv] + " 00:00:01" #of when an edge appears on a particular day, it is shown from 00:00:01 to 23:59:59 of that day.
	#But the log files tell us when an edge appears, so we use that to make our timelapse more meaningful and precise.
	for xz in range(len(today2)):
		today2[xz]= today2[xz] + " 23:59:59" #Once an edge appears, it only disappears when the day ends, i.e. at 23:59:59. This can be changed as needed.
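	#A compact sketch of the same date generation (assuming, like the loop above, that the
	#log window starts 3*365 + 105 days before "now"; datetime is the stdlib module imported for this file):
	demo_start = datetime.datetime.now() - datetime.timedelta(days=3*365 + 105)
	demo_dates = [(demo_start + datetime.timedelta(days=demo_i)).strftime('%Y-%m-%d') for demo_i in range(3)]
	#demo_dates holds the first three dates of the window, matching today1[0:3]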

	nick_same_list=[[] for i in range(5000)]

	nicks = [] #list of all the nicknames
	my_sum = 0

	for folderiterator in range(startingMonth, endingMonth+1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the channel_name log file
	
			send_time = [] #list of all the times a user sends a message to another user
			nicks_for_the_day = []
			
			print(filePath + " For Nicks")
		
			#code for getting all the nicknames in a list
			for i in content:
				if(i[0] != '=' and "] <" in i and "> " in i):
					m = re.search(r"\<(.*?)\>", i)
					if m.group(0) not in nicks_for_the_day:                       
						nicks_for_the_day.append(m.group(0))   #used regex to get the string between <> and appended it to the nicks list

			for i in xrange(0,len(nicks_for_the_day)):
				if nicks_for_the_day[i][1:-1] not in nicks:
					nicks.append(nicks_for_the_day[i][1:-1])     #removed <> from the nicknames
				
			for i in xrange(0,len(nicks)):
				if(len(nicks[i])!=0):
						nicks[i]=correctLastCharCR(nicks[i])

			for j in content:
				if(j[0]=='=' and "changed the topic of" not in j):
					line1=j[j.find("=")+1:j.find(" is")]
					line2=j[j.find("wn as")+1:j.find("\n")]
					line1=line1[3:]
					line2=line2[5:]
					if(len(line1)!=0):
						line1=correctLastCharCR(line1)
						
					if(len(line2)!=0):
						line2=correctLastCharCR(line2)
					if line1 not in nicks:
						nicks.append(line1)
					if line2 not in nicks:
						nicks.append(line2)
			
			#code for forming a list of lists to avoid nickname duplication
			for line in content:
				if(line[0]=='=' and "changed the topic of" not in line):
					line1=line[line.find("=")+1:line.find(" is")]
					line2=line[line.find("wn as")+1:line.find("\n")]
					line1=line1[3:]
					line2=line2[5:]
					if(len(line1)!=0):
						line1=correctLastCharCR(line1)
						
					if(len(line2)!=0):
						line2=correctLastCharCR(line2)
					for i in range(5000):
						if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
							if line1 in nick_same_list[i] and line2 not in nick_same_list[i]:
								nick_same_list[i].append(line2)
								break
							if line2 in nick_same_list[i] and line1 not in nick_same_list[i]: 
								nick_same_list[i].append(line1)
								break
							if line2 in nick_same_list[i] and line1 in nick_same_list[i]:
								break  
						if not nick_same_list[i]:
							nick_same_list[i].append(line1)
							nick_same_list[i].append(line2)
							break
	for ni in nicks:
		for ind in range(5000):
			if ni in nick_same_list[ind]:
				break
			if not nick_same_list[ind]:
				nick_same_list[ind].append(ni)
				break

	#print("*********************x**********************************")
	#print(nick_same_list)

	G = to_graph(nick_same_list)
	L = [list(c) for c in connected_components(G)]  #materialize each component set as an indexable list

	#The explanation for the aforementioned code has already been given in parser-RT.py.

	#The block below builds the node rows for Gephi. We make sure that all nodes are present throughout the timelapse, so the appear
	#and disappear times for every node are 1 Jan 00:00:01 and 1 Feb 00:00:01 (the timelapse covers one month). To make nodes appear or
	#disappear at other times, change the values here. All of these rows are stored in a CSV file; Gephi imports the node and edge CSV
	#files, parses them and produces the timelapse.
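	#Sketch of a single node row in the four-column layout written below (the nick UID twice,
	#then the appear and disappear times; the UID "nick1" is hypothetical):
	demo_node_row = ("nick1", "nick1", "2013-01-01 00:00:01", "2013-02-01 00:00:01")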
	createvar = -1
	
	for folderiterator in range(startingMonth, endingMonth+1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the channel_name log file
			createvar = createvar + 1
			nicks_for_the_day_2 = []
			print(filePath + " For Nodes")
			
			#code for getting all the nicknames in a list
			for il in content:
				if(il[0] != '=' and "] <" in il and "> " in il):
					m = re.search(r"\<(.*?)\>", il)
					if m.group(0) not in nicks_for_the_day_2:                       
						nicks_for_the_day_2.append(m.group(0))   #used regex to get the string between <> and appended it to the nicks list
				
			for nk in nicks_for_the_day_2:   #renamed from nx to avoid shadowing the networkx alias
				for dk in range(len(nicks)):
					if (dk<len(L) and nk[1:-1] in L[dk]):
						col1.append(str(L[dk][0]))
						col2.append(str(L[dk][0]))
						#col3.append(today1[createvar])
						#col4.append(today2[createvar])
						col3.append("2013-01-01 00:00:01") #Nodes stay throughout the month. Col3 and Col4 are the appear and disappear times. Hardcode them ;)
						col4.append("2013-02-01 00:00:01")
						break
		
	rows = zip(col1,col2,col3,col4)   	#Store all the node rows in a CSV file which is later imported into Gephi.
	with open('/home/dhruvie/LOP/nodesgephi_unchained.csv', 'a+') as myfile:
		wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
		for ro in rows:
			wr.writerow(ro)
					
	#The code below similarly produces the edge CSV file, which is later imported into Gephi. The difference here
	#is that we know exactly when each edge appears: we take the time from the log file (line[1:6]) and append ":00" to satisfy the hh:mm:ss format Gephi expects.
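	#Worked sketch of the edge appear-time assembly (the sample log line is hypothetical;
	#line[1:6] holds "HH:MM" in these logs):
	demo_line = "[12:34] <alice> bob: hello"
	demo_appear = "2013-01-05" + " " + demo_line[1:6] + ":00"  # -> "2013-01-05 12:34:00"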

	createvar_used=-1 
	
	for folderiterator in range(startingMonth, endingMonth+1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the channel_name log file
			createvar_used=createvar_used+1    
			print(filePath + " For Edges")
			nickplots = [] #nickplots stores all nicks from the nicks[] list whose in-degree or out-degree in the conversation graph is non-zero
			indegree = [] #stores the in-degree corresponding to each nick
			outdegree = [] #stores the out-degree corresponding to each nick
		
	#  G1 = nx.MultiDiGraph()  
			for line in content:
				flag_comma = 0
				if(line[0] != '=' and "] <" in line and "> " in line):
					m = re.search(r"\<(.*?)\>", line)
					var = m.group(0)[1:-1]
					var=correctLastCharCR(var)
					for d in range(len(nicks)):
						if (d<len(L) and var in L[d]):
							nick_sender = L[d][0]
							break
						
					for i in nicks:
						rec_list=[e.strip() for e in line.split(':')]
						rec_list[1]=rec_list[1][rec_list[1].find(">")+1:len(rec_list[1])]
						rec_list[1]=rec_list[1][1:]
						if not rec_list[1]:
							break
						for ik in xrange(0,len(rec_list)):
							if(rec_list[ik]):
								rec_list[ik]=correctLastCharCR(rec_list[ik])
						for z in rec_list:
							if(z==i):
								send_time.append(line[1:6])
								if(var != i): 	
									for d in range(len(nicks)):
										if i in L[d]:
											nick_receiver=L[d][0]
											break
									col_edge1.append(nick_sender)
									col_edge2.append(nick_receiver)			#We are going to add all these columns in a csv file for Gephi's use.
									col_edge3.append("Directed")
									col_edge4.append(1.0)
									col_edge5.append(today1[createvar_used]+" "+line[1:6]+":00")
									col_edge6.append(today2[createvar_used])
									my_sum=my_sum+1

		#       G1.add_edge(nick_sender,nick_receiver,weight=line[1:6])
									if nick_sender not in nickplots:
										nickplots.append(nick_sender)	 
									if nick_receiver not in nickplots:	#Right time to append to nickplots.
										nickplots.append(nick_receiver) 
							
						if "," in rec_list[1]: 
							flag_comma = 1
							rec_list_2=[e.strip() for e in rec_list[1].split(',')]
							for ij in xrange(0,len(rec_list_2)):
								if(rec_list_2[ij]):
									rec_list_2[ij]=correctLastCharCR(rec_list_2[ij])
							for j in rec_list_2:
								if(j==i):
									send_time.append(line[1:6])
									if(var != i): 	
										for d in range(len(nicks)):
											if i in L[d]:
												nick_receiver=L[d][0]
												break
										col_edge1.append(nick_sender)
										col_edge2.append(nick_receiver)
										col_edge3.append("Directed")
										col_edge4.append(1.0)
										col_edge5.append(today1[createvar_used]+" "+line[1:6]+":00")
										col_edge6.append(today2[createvar_used])           	  
										my_sum=my_sum+1
									# G1.add_edge(nick_sender,nick_receiver,weight=line[1:6])	 
										if nick_sender not in nickplots:
											nickplots.append(nick_sender)   
										if nick_receiver not in nickplots:
											nickplots.append(nick_receiver) 

						if(flag_comma == 0):
							rec=line[line.find(">")+1:line.find(", ")] 
							rec=rec[1:]
							rec=correctLastCharCR(rec)
							if(rec==i):
								send_time.append(line[1:6])
								if(var != i):
									for d in range(len(nicks)):
										if i in L[d]:
											nick_receiver=L[d][0]
											break
									col_edge1.append(nick_sender)
									col_edge2.append(nick_receiver)
									col_edge3.append("Directed")
									col_edge4.append(1.0)
									col_edge5.append(today1[createvar_used]+" "+line[1:6]+":00")
									col_edge6.append(today2[createvar_used]) 
									my_sum=my_sum+1
								# G1.add_edge(nick_sender,nick_receiver,weight=line[1:6])	 
									if nick_sender not in nickplots:
										nickplots.append(nick_sender)   
									if nick_receiver not in nickplots:
										nickplots.append(nick_receiver) 
						
	edge_rows = zip(col_edge1,col_edge2,col_edge3,col_edge4,col_edge5,col_edge6)   
	with open('/home/dhruvie/LOP/edgesgephi_unchained.csv', 'a+') as myfile:
		wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
		for rz in edge_rows:
			wr.writerow(rz)


for ni in nicks:
	for ind in range(7000):
		if ni in x[ind]:
			break
		if not x[ind]:
			x[ind].append(ni)
			break

#Append all the nicks from nicks array which are not already present in x. These would be the ones who did not change their nicks throughout the
#year.

G = to_graph(x) #function definition is on top
L = [list(c) for c in connected_components(G)]  #materialize; the generator of sets cannot be indexed

for i in range(1,len(L)+1):
	L[i-1] = [i]+L[i-1]  #prepend a numeric UID to each nick cluster

#We use the connected-components algorithm to group all nick clusters that share at least one nick. E.g. given
#Cluster 1: nick1,nick2,nick3,nick4 (some nicks of one user) and Cluster 2: nick5,nick6,nick2,nick7, we get nick1,...,nick7 and can safely assume they all belong to the same user.
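#A self-contained sketch of this merging step on two hypothetical clusters that share nick2
#(to_graph is the helper defined on top; connected_components is networkx's):
demo_clusters = [['nick1','nick2','nick3','nick4'], ['nick5','nick6','nick2','nick7']]
demo_G = to_graph(demo_clusters)
demo_merged = [sorted(c) for c in connected_components(demo_G)]
#demo_merged == [['nick1', 'nick2', 'nick3', 'nick4', 'nick5', 'nick6', 'nick7']]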

xarr=[[] for i in range(10000)] #This might need to be increased beyond 10000 for more users. Same logic as the 7000 bound above; applies to the other functions too.
graph_to_sir = []                ## A data structure without a fixed upper bound would be preferable to these pre-sized arrays.
graph_x_axis = []
graph_y_axis = []
graphx1 =[]
graphy1 =[]
def findConvLength_ConvRefreshTime(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):

	nick_same_list=[[] for i in range(7000)]
	nicks = [] #list of all the nicknames
	conv = []
	conv_diff = []
	
	# out_dir_msg_num = output_directory+"CL/"
	out_dir_msg_num = output_directory
	if not os.path.exists(os.path.dirname(out_dir_msg_num)):
		try:
			os.makedirs(os.path.dirname(out_dir_msg_num))
		except OSError as exc: # Guard against race condition
			if exc.errno != errno.EEXIST:
				raise

	for folderiterator in range(startingMonth, endingMonth + 1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the channel_name log file
					
			send_time = [] #list of all the times a user sends a message to another user
			nicks_for_the_day = []
			
			print(filePath)   
			#code for getting all the nicknames in a list
			for i in content:
				if(i[0] != '=' and "] <" in i and "> " in i):
					m = re.search(r"\<(.*?)\>", i)
					if m.group(0) not in nicks_for_the_day:                       
						nicks_for_the_day.append(m.group(0))   #used regex to get the string between <> and appended it to the nicks list

			for i in xrange(0,len(nicks_for_the_day)):
				if nicks_for_the_day[i][1:-1] not in nicks:
					nicks.append(nicks_for_the_day[i][1:-1])     #removed <> from the nicknames
				
			for i in xrange(0,len(nicks)):
				if(len(nicks[i])!=0):
						nicks[i]=correctLastCharCR(nicks[i])

			for j in content:
				if(j[0]=='=' and "changed the topic of" not in j):
					line1=j[j.find("=")+1:j.find(" is")]
					line2=j[j.find("wn as")+1:j.find("\n")]
					line1=line1[3:]
					line2=line2[5:]
					if(len(line1)!=0):
						line1=correctLastCharCR(line1)
						
					if(len(line2)!=0):
						line2=correctLastCharCR(line2)
						
					if line1 not in nicks:
						nicks.append(line1)
					if line2 not in nicks:
						nicks.append(line2)
			
			#code for forming a list of lists to avoid nickname duplication
			for line in content:
				if(line[0]=='=' and "changed the topic of" not in line):
					line1=line[line.find("=")+1:line.find(" is")]
					line2=line[line.find("wn as")+1:line.find("\n")]
					line1=line1[3:]
					line2=line2[5:]
					if(len(line1)!=0):
						line1=correctLastCharCR(line1)
						
					if(len(line2)!=0):
						line2=correctLastCharCR(line2)
						
					for i in range(7000):
						if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
							if line1 in nick_same_list[i] and line2 not in nick_same_list[i]:
								nick_same_list[i].append(line2)
								break
							if line2 in nick_same_list[i] and line1 not in nick_same_list[i]: 
								nick_same_list[i].append(line1)
								break
							if line2 in nick_same_list[i] and line1 in nick_same_list[i]:
								break  
						if not nick_same_list[i]:
							nick_same_list[i].append(line1)
							nick_same_list[i].append(line2)
							break

	for ni in nicks:
		for ind in range(7000):
			if ni in nick_same_list[ind]:
				break
			if not nick_same_list[ind]:
				nick_same_list[ind].append(ni)
				break

	G = to_graph(nick_same_list)
	L = [list(c) for c in connected_components(G)]  #materialize; the generator of sets cannot be indexed

	for i in range(1,len(L)+1):
		L[i-1] = [i]+L[i-1]  #prepend a numeric UID to each nick cluster

	#We use the connected-components algorithm to group all nick clusters that share at least one nick. E.g. given
	#Cluster 1: nick1,nick2,nick3,nick4 (some nicks of one user) and Cluster 2: nick5,nick6,nick2,nick7, we get nick1,...,nick7 and can safely assume they all belong to the same user.

	conversations=[[] for i in range(10000)] #This might need to be increased beyond 10000 for more users. Same logic as the 7000 bound above; applies to the other functions too.
	graph_to_sir = []                ## A data structure without a fixed upper bound would be preferable to these pre-sized arrays.
	graph_x_axis = []
	graph_y_axis = []
	graphx1 =[]
	graphy1 =[]
	graphx2 =[]
	graphy2 =[]

	dateadd=-1 #Counts the days processed so far; used for the response-time calculation.
	for folderiterator in range(startingMonth, endingMonth + 1):
		temp1 = "0" if folderiterator < 10 else ""
		for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
			temp2 = "0" if fileiterator < 10 else ""
			filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"   
			if not os.path.exists(filePath):
				if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): 
					print "[Error] Path "+filePath+" doesn't exist"
				continue 
			with open(filePath) as f:
				content = f.readlines() #content stores all the lines of the channel_name log file
			dateadd=dateadd+1
			send_time = [] #list of all the times a user sends a message to another user
			meanstd_list = []
			totalmeanstd_list = []
			x_axis = []
			y_axis = []
			real_y_axis = []
			time_in_min = [[] for i in range(1000)]
			
			print(filePath)

			#code for making relation map between clients		
			for line in content:
				flag_comma = 0
				if(line[0] != '=' and "] <" in line and "> " in line):
					m = re.search(r"\<(.*?)\>", line)
					var = m.group(0)[1:-1]
					var=correctLastCharCR(var)
					for d in range(len(nicks)):                              #E.g. if a user's nicks are rohan1,rohan2,rohan3,... then nick_sender stores the canonical one, rohan1.
						if((d < len(L)) and (var in L[d])):
							nick_sender = L[d][0]
							break
						
					for i in nicks:
						rec_list=[e.strip() for e in line.split(':')]
						rec_list[1]=rec_list[1][rec_list[1].find(">")+1:len(rec_list[1])]
						rec_list[1]=rec_list[1][1:]
						if not rec_list[1]:
							break
						for ik in xrange(0,len(rec_list)):
							if(rec_list[ik]):
								rec_list[ik]=correctLastCharCR(rec_list[ik])

						for z in rec_list:
							if(z==i):
								send_time.append(line[1:6])
								if(var != i): 	
									for d in range(len(nicks)):
										if((d<len(L)) and (i in L[d])):
											nick_receiver=L[d][0]
											break
										
									for rt in xrange(0,10000):
										if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]):
											conversations[rt].append(24*60*dateadd + int(line[1:6][0:2])*60+int(line[1:6][3:5])) #Append the response time to the ongoing conversation between this sender/receiver pair.
											break
										if(len(conversations[rt])==0):                                            #If the two have not conversed before, start a new entry: sender UID, receiver UID, then the response times.
											conversations[rt].append(nick_sender)
											conversations[rt].append(nick_receiver)
											conversations[rt].append(24*60*dateadd + int(line[1:6][0:2])*60+int(line[1:6][3:5]))
											break
							
						if "," in rec_list[1]: 
							flag_comma = 1
							rec_list_2=[e.strip() for e in rec_list[1].split(',')]
							for ij in xrange(0,len(rec_list_2)):
								if(rec_list_2[ij]):
									rec_list_2[ij]=correctLastCharCR(rec_list_2[ij])

							for j in rec_list_2:
								if(j==i):
									send_time.append(line[1:6])
									if(var != i): 	
										for d in range(len(nicks)):
											if((d<len(L)) and (i in L[d])):   #Lines 212-255 consider all cases in which messages are addressed such as - nick1:nick2 or nick1,nick2,
												nick_receiver=L[d][0]                   #or nick1,nick2:
												break
										
										for rt in xrange(0,10000):
											if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]):
												conversations[rt].append(24*60*dateadd + int(line[1:6][0:2])*60+int(line[1:6][3:5])) 
												break
											if(len(conversations[rt])==0):
												conversations[rt].append(nick_sender)
												conversations[rt].append(nick_receiver)
												conversations[rt].append(24*60*dateadd + int(line[1:6][0:2])*60+int(line[1:6][3:5]))
												break

						if(flag_comma == 0):
							rec=line[line.find(">")+1:line.find(", ")] 
							rec=rec[1:]
							rec=correctLastCharCR(rec)
								
							if(rec==i):
								send_time.append(line[1:6])
								if(var != i):
									for d in range(len(nicks)):
										if ((d<len(L)) and (i in L[d])):
											nick_receiver=L[d][0]
											break
									
									for rt in xrange(0,10000):
										if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]):	
											conversations[rt].append(24*60*dateadd + int(line[1:6][0:2])*60+int(line[1:6][3:5]))
											break
										if(len(conversations[rt])==0):
											conversations[rt].append(nick_sender)
											conversations[rt].append(nick_receiver)
											conversations[rt].append(24*60*dateadd + int(line[1:6][0:2])*60+int(line[1:6][3:5]))
											break
		
	#The branches above cover all the ways messages are addressed (nick1:nick2, nick1,nick2 or nick1,nick2:) and store the response times in conversations. conversations[i] contains all the response times between one pair of users over the entire year.
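	#Worked sketch of the timestamp-to-minutes conversion used above (line[1:6] is "HH:MM";
	#dateadd counts the days processed so far):
	demo_stamp = "09:05"   #hypothetical HH:MM from a log line, on day dateadd = 2
	demo_minutes = 24*60*2 + int(demo_stamp[0:2])*60 + int(demo_stamp[3:5])
	#demo_minutes == 2880 + 545 == 3425 minutes since the start of day 0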

	for ty in range(0,len(conversations)):       #This loop removes the first two elements of every conversations[i]: they are the UIDs of the sender and receiver (not response times),
		if(len(conversations[ty])!=0):              #so the response times start at index 2. After this, conversations holds only the response times.
			del conversations[ty][0:2]

	for fg in range(0,len(conversations)):
		if(len(conversations[fg])!=0):
			first=conversations[fg][0]
			for gh in range(1,len(conversations[fg])):
					if(conversations[fg][gh]-conversations[fg][gh-1]>9):
					
						conv.append(conversations[fg][gh-1]-first)    #We are recording the conversation length in conv and CRT in conv_diff. Here 9 is the average response
																																										#time we have already found before(see parser-RT.py). For every channel this value differs and would have to be changed in the code.
						conv_diff.append(conversations[fg][gh]-conversations[fg][gh-1])
						first=conversations[fg][gh]
					if(gh==(len(conversations[fg])-1)):
						conv.append(conversations[fg][gh]-first)					
						break
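	#A worked sketch of the split above on one hypothetical response-time series, using the
	#same 9-minute gap threshold (the 15 -> 40 jump starts a new conversation):
	demo_times = [10, 12, 15, 40, 41]
	demo_cl, demo_crt, demo_first = [], [], demo_times[0]
	for demo_gh in range(1, len(demo_times)):
		if demo_times[demo_gh] - demo_times[demo_gh-1] > 9:
			demo_cl.append(demo_times[demo_gh-1] - demo_first)            #conversation length (CL)
			demo_crt.append(demo_times[demo_gh] - demo_times[demo_gh-1])  #conversation refresh time (CRT)
			demo_first = demo_times[demo_gh]
		if demo_gh == len(demo_times) - 1:
			demo_cl.append(demo_times[demo_gh] - demo_first)
	#demo_cl == [5, 1], demo_crt == [25]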

	for op in range(0,max(conv)+1):   #+1 so the largest CL value is counted too
		graphx1.append(op)
		graphy1.append(conv.count(op))

	for po in range(0,max(conv_diff)+1):
		graphx2.append(po)
		graphy2.append(conv_diff.count(po))

#To plot the CDF we store the CL and CRT values and their number of occurrences as shown above.
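	#A compact alternative sketch for the same frequency counting, using the standard
	#library instead of repeated list.count calls:
	#	from collections import Counter
	#	demo_pairs = sorted(Counter(conv).items())  # (value, occurrences) rows for the CSV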

	row_cl = zip(graphx1,graphy1)
	filename1= out_dir_msg_num+channel_name+"_"+str(startingMonth)+"-"+str(startingDate)+"_"+str(endingMonth)+"-"+str(endingDate)+"_CL.csv"
	with open(filename1, 'a+') as myfile:
		wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
		for row in row_cl:
			wr.writerow(row)

	row_crt = zip(graphx2,graphy2)
	filename2= out_dir_msg_num+channel_name+"_"+str(startingMonth)+"-"+str(startingDate)+"_"+str(endingMonth)+"-"+str(endingDate)+"_CRT.csv"
	with open(filename2, 'a+') as myfile:
		wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
		for row in row_crt:
			wr.writerow(row)