def _get_h1_tags_to_merge(related_pairs):
    """Merge groups of related tags that share at least one member.

    Each element of ``related_pairs`` is treated as a chain of nodes
    (``['a', 'b', 'c']`` contributes the edges ``a-b`` and ``b-c``).  Groups
    that end up in the same connected component are merged into one list.

    :param related_pairs: iterable of iterables of hashable tags.
    :return: list of lists, one list per connected component.
    """
    # The graph is built from a deep copy, exactly as before, so the nodes
    # handed back are copies and never alias the caller's objects.
    groups = copy.deepcopy(related_pairs)

    def to_edges(nodes):
        """Yield consecutive pairs: ['a', 'b', 'c', 'd'] -> (a, b), (b, c), (c, d)."""
        it = iter(nodes)
        # A bare next() on an empty group raises StopIteration inside this
        # generator, which Python 3.7+ (PEP 479) converts into RuntimeError.
        missing = object()
        last = next(it, missing)
        if last is missing:
            return
        for current in it:
            yield last, current
            last = current

    def to_graph(parts):
        """Build a graph in which every group forms a path of its members."""
        graph = networkx.Graph()
        for part in parts:
            # each sublist is a bunch of nodes
            graph.add_nodes_from(part)
            # it also implies a number of edges
            graph.add_edges_from(to_edges(part))
        return graph

    G = to_graph(groups)
    return [list(component) for component in connected_components(G)]
def get_connections(coordinates, distance=2, tree_type='KDTree'):
    """Get spots which are conditionally connected

    Parameters
    ----------
    coordinates : array of tuples
        Spot coordinates.
    distance : float
        Neighbourhood radius passed to ``kdtree_clustering``.  Only used
        when ``tree_type == 'KDTree'``.
    tree_type : str
        ``'KDTree'`` selects the clustering branch; any other value selects
        the ``get_connectedspots`` branch.

    Returns
    -------
    list or tuple
        For ``'KDTree'``: the connected components of the neighbour graph,
        sorted by size (largest first).
        Otherwise: ``(connected_spots, lines)`` where ``lines`` holds, for
        every ``(i, j)`` pair in ``connected_spots``, the two coordinates
        ``[coordinates[i], coordinates[j]]``.
    """
    if tree_type == 'KDTree':
        nn_points = kdtree_clustering(coordinates=coordinates, distance=distance)
        graph_tree = to_graph(nn_points)
        return sorted(connected_components(graph_tree), key=len, reverse=True)

    # NOTE(review): this branch uses a fixed radius of 1.5 and ignores
    # `distance` (the old comment mentioned d <= 2) - confirm which is meant.
    # Convert once instead of rebuilding the array twice for every pair.
    coords = np.array(coordinates)
    conneted_spots = get_connectedspots(coordinates=coords, distance=1.5)
    lines = [[coords[i], coords[j]] for i, j in conneted_spots]
    return conneted_spots, lines
def getCIGroups(local_data, ds_context=None, scope=None, families=None):
    """
    Split the output variables into conditionally independent groups.

    :param local_data: np array
    :param ds_context: context object; its ``feature_size`` is forwarded to ``get_YX``
    :param scope: a list of index to output variables
    :param families: obsolete
    :return: result of ``split_conditional_data_by_clusters`` for the found clusters

    ``alpha`` (cut-off for the connected components) and ``epsilon`` are not
    parameters of this function; they are read from the surrounding scope.
    BE CAREFUL WITH SPARSE DATA!
    """
    y, x = get_YX(local_data, ds_context.feature_size)

    pvals = testRcoT(y, x) + epsilon
    # Entries above the cut-off are zeroed, i.e. they create no edge.
    pvals[pvals > alpha] = 0

    clusters = np.zeros(y.shape[1])
    dependency_graph = from_numpy_matrix(pvals)
    for label, members in enumerate(connected_components(dependency_graph), start=1):
        clusters[list(members)] = label

    return split_conditional_data_by_clusters(y, x, clusters, scope, rows=False)
def getCIGroups(local_data, ds_context=None, scope=None, alpha=0.001, families=None):
    """
    Split the output variables into conditionally independent groups.

    :param local_data: np array
    :param ds_context: forwarded to ``preproc``
    :param scope: a list of index to output variables
    :param alpha: threshold; p-values above it are dropped before the
        connected components are computed
    :param families: obsolete
    :return: result of ``split_conditional_data_by_clusters`` for the found clusters

    The first ``len(scope)`` columns of the preprocessed data are taken as
    outputs, the remaining columns as conditionals.
    BE CAREFUL WITH SPARSE DATA!
    """
    data = preproc(local_data, ds_context, None, ohe)
    n_rows = data.shape[0]

    # todo check scope and node.scope again
    is_output = np.zeros(data.shape, dtype=bool)
    is_output[:, np.arange(len(scope))] = True

    outputs = data[is_output].reshape(n_rows, -1)
    conditionals = data[~is_output].reshape(n_rows, -1)
    assert len(conditionals) > 0
    assert len(outputs) > 0

    pvals = testRcoT(outputs, conditionals)
    pvals[pvals > alpha] = 0

    clusters = np.zeros(outputs.shape[1])
    dependency_graph = from_numpy_matrix(pvals)
    for label, members in enumerate(connected_components(dependency_graph), start=1):
        clusters[list(members)] = label

    return split_conditional_data_by_clusters(local_data, clusters, scope, rows=False)
def find_mentions(entities):
    """
    Group entity mentions by their grounded knowledge-base ids.

    Args:
        entities: iterable of entity structs exposing a ``kb_id`` sequence

    Returns: (OrderedDict) maps a tuple of linked kb ids to the list of
             entities sharing one of them; every entity whose first id is
             '-1' gets its own ('UNK:<n>',) key
    """
    # distinct kb_id groups, in order of first appearance
    equivalents = []
    for entity in entities:
        if entity.kb_id not in equivalents:
            equivalents.append(entity.kb_id)

    components = connected_components(to_graph(equivalents))

    unique_entities = OrderedDict()
    for component in components:
        key = tuple(component)
        if key[0] == '-1':
            continue
        unique_entities[key] = []

    # consider non-grounded entities as separate entities
    unk_id = 0
    for entity in entities:
        if entity.kb_id[0] == '-1':
            unique_entities[('UNK:' + str(unk_id),)] = [entity]
            unk_id += 1
            continue
        for key in unique_entities:
            if set(entity.kb_id).intersection(set(key)):
                unique_entities[key].append(entity)
    return unique_entities
def idx_cleanboxes(boxes, scores, second_cutoff=0.83):
    '''
    Pick the box indices to keep after grouping related boxes.

    boxes: 2d npy containing all boxes in the image (columns x0, y0, x1, y1)
    scores: 1d npy containing score for each box in boxes
    second_cutoff: within a group, every box scoring at least this much is
        kept; if none does, only the best-scoring box is kept

    Two boxes are linked when `_area(box1, box2)` is truthy; groups are the
    connected components of those links.
    '''
    df = pd.DataFrame(boxes)
    df.columns = ['x0', 'y0', 'x1', 'y1']
    df['score'] = scores

    clusters = []
    for box1 in df.itertuples():
        # only compare against boxes that come later in the frame
        later_boxes = df.iloc[box1.Index + 1:, :]
        cluster = {box1.Index}
        cluster.update(box2.Index for box2 in later_boxes.itertuples()
                       if _area(box1, box2))
        clusters.append(cluster)

    G = _to_graph(clusters)

    final_index = []
    for component in connected_components(G):
        component_scores = df.loc[list(component), 'score']
        confident = component_scores[component_scores >= second_cutoff]
        confident_idx = confident.index.to_list()
        if confident_idx:
            final_index.extend(confident_idx)
        else:
            final_index.append(component_scores.idxmax())
    return final_index
def getIndependentRDCGroups_py(data_slice, threshold, k=None, s=1. / 6., non_linearity=numpy.sin, n_jobs=1, rand_gen=None):
    """Label each feature with the id of its RDC-dependent group.

    The pairwise matrix returned by ``rdc_test`` is thresholded (entries
    below ``threshold`` become 0) and interpreted as an adjacency matrix.
    Features in the same connected component share a label.

    Returns a float array of length ``len(data_slice.cols)`` whose labels
    start at 1.
    """
    rdc_adjacency_matrix = rdc_test(data_slice,
                                    k=k,
                                    s=s,
                                    non_linearity=non_linearity,
                                    n_jobs=n_jobs,
                                    rand_gen=rand_gen)
    n_features = len(data_slice.cols)

    # thresholding
    weak = rdc_adjacency_matrix < threshold
    rdc_adjacency_matrix[weak] = 0

    # getting connected components
    result = numpy.zeros(n_features)
    graph = from_numpy_matrix(rdc_adjacency_matrix)
    for label, members in enumerate(connected_components(graph), start=1):
        result[list(members)] = label
    return result
def compute_overlapping_pairs(candidate_matching_pairs):
    """Group matching pairs into connected components.

    Every pair becomes an edge of an undirected graph (nodes are specific
    article files).  Returns the list of connected components.
    """
    graph = nx.Graph()
    graph.add_edges_from(candidate_matching_pairs)
    return [component for component in connected_components(graph)]
def pairwise_connectivity(G):
    """Return the number of unordered node pairs lying in the same component.

    A component with n nodes contributes n * (n - 1) // 2 pairs.
    """
    sizes = (len(component) for component in connected.connected_components(G))
    return sum((n * (n - 1)) // 2 for n in sizes)
def assign_clusters():
    '''
    Assign donor IDs to contributions based on clusters of matching records.

    For every last-name group a graph is built whose edges are the matches
    the classifier considers "same donor".  Each connected component of that
    graph is one donor; its members receive a classifier_id made of the
    component number followed by a hash of the group id, cut to 12
    characters.  Contributions still without an id afterwards get an id
    derived from their match_repr.
    '''
    # Again, instantiate and train our classifier.
    # NOTE(review): eval() runs on feature strings stored in the database;
    # that is only acceptable while those strings are fully trusted.
    clf = RandomForestClassifier(n_estimators=10, random_state=0)
    training_features = [eval(t.features) for t in TRAINING_DATA]
    training_labels = [int(t.same) for t in TRAINING_DATA]
    clf = clf.fit(training_features, training_labels)

    # Loop through the last name groups
    print('Processing groups ...')
    for g in Contribution.objects.all().values('group_id').distinct():
        group_id = g['group_id']
        if not group_id:
            continue

        toupdate = []
        # One graph per last name group
        G = nx.Graph()
        # A simple hash of the group id helps generate unique donor IDs
        nameid = hashlib.sha224(str(group_id)).hexdigest()

        # An edge means the classifier rates "same donor" above "different".
        # See https://github.com/cjdd3b/fec-standardizer/wiki/Matching-donors
        for m in Match.objects.filter(c1__group_id=group_id):
            edge = clf.predict_proba(eval(m.features))
            if edge[0][1] > edge[0][0]:
                G.add_edge(m.c1, m.c2)

        # A donor is a small network of connected contributions.
        # See https://github.com/cjdd3b/fec-standardizer/wiki/Defining-donor-clusters
        for donor_id, members in enumerate(connected_components(G)):
            for contribution in members:
                classifier_id = '%s%s' % (donor_id, nameid)
                contribution.classifier_id = classifier_id[:12]
                toupdate.append(contribution)

        # Bulk save the donor IDs to the contribution table
        commit_saves(toupdate)

    print('Cleaning up the leftovers ...')
    tocleanup = []
    for record in Contribution.objects.filter(classifier_id__isnull=True):
        if not record.match_repr:
            continue
        classifier_id = '99%s' % hashlib.sha224(record.match_repr).hexdigest()
        record.classifier_id = classifier_id[:12]
        tocleanup.append(record)
    commit_saves(tocleanup)
def to_graph(l):
    '''
    Build a graph from groups of nodes and return its connected components.

    Every element of `l` adds its members as nodes plus the edges produced
    by `to_edges` for that group.  The result is a list of components as
    yielded by networkx `connected_components`.
    '''
    graph = networkx.Graph()
    for group in l:
        # each sublist is a bunch of nodes ...
        graph.add_nodes_from(group)
        # ... and also implies a number of edges
        graph.add_edges_from(to_edges(group))
    components = connected_components(graph)
    return list(components)
def get_unidirectional_scheme(df, threshold=5):
    """Group class names that are linked by values above ``threshold``.

    ``df.values`` is used as an adjacency matrix after every entry that is
    not greater than ``threshold`` has been set to 0.  Each connected
    component with more than one node is translated back to class names
    (taken from ``df.index``) and the list of those chains is passed to
    ``get_reorg_dict``.
    """
    values = copy.copy(df.values)
    adjacency = np.where(values > threshold, values, 0)
    class_names = list(df.index)

    graph = nx.from_numpy_matrix(adjacency, create_using=nx.Graph)
    unidirectional_chains = [
        [class_names[node] for node in component]
        for component in connected_components(graph)
        if len(component) != 1
    ]
    return get_reorg_dict(unidirectional_chains)
def one_shot_agglomeration(self, threshold=0.5):
    """Agglomerate in a single pass by thresholding edge weights.

    Works on a copy of the graph: every edge that touches the boundary body
    or whose 'weight' exceeds ``threshold`` is removed, then each remaining
    connected component is merged with ``merge_subgraph``.

    Returns the result of ``get_segmentation()`` on the modified copy.
    """
    g = self.copy()
    if len(g.merge_queue) == 0:
        g.rebuild_merge_queue()
    # Iterate over a snapshot: with networkx >= 2 `edges()` is a live view
    # and removing edges while iterating it raises RuntimeError.
    for u, v, d in list(g.edges(data=True)):
        if g.boundary_body in [u, v] or d['weight'] > threshold:
            g.remove_edge(u, v)
    # connected_components() is a lazy generator in recent networkx; it is
    # materialised first in case merge_subgraph modifies g while we iterate.
    ccs = list(connected_components(g))
    for cc in ccs:
        g.merge_subgraph(cc)
    return g.get_segmentation()
def one_shot_agglomeration(self, threshold=0.5):
    """Agglomerate in a single pass by thresholding edge weights.

    On a copy of the graph, drop every edge that touches the boundary body
    or whose 'weight' is above ``threshold``; afterwards merge each of the
    remaining connected components via ``merge_subgraph``.

    Returns whatever ``get_segmentation()`` yields for the modified copy.
    """
    g = self.copy()
    if len(g.merge_queue) == 0:
        g.rebuild_merge_queue()
    # Collect the edges to drop before touching the graph: mutating it while
    # iterating the live edge view fails on networkx >= 2 (RuntimeError).
    doomed = [(u, v) for u, v, d in g.edges(data=True)
              if g.boundary_body in [u, v] or d['weight'] > threshold]
    for u, v in doomed:
        g.remove_edge(u, v)
    # Materialise the (possibly lazy) component generator before merging, in
    # case merge_subgraph changes g during the loop.
    for cc in list(connected_components(g)):
        g.merge_subgraph(cc)
    return g.get_segmentation()
def assign_clusters():
    '''
    Perform the donor assignment on the contribution table.

    Per last-name group, matches that the classifier rates as "same donor"
    become edges of a graph; each connected component is treated as one
    donor.  Members get a classifier_id built from the component number and
    a hash of the group id (first 12 characters).  Records that remain
    without an id are then given one derived from their match_repr.
    '''
    # Instantiate and train our classifier.
    # NOTE(review): eval() is applied to feature strings coming from stored
    # records - only safe as long as that data is trusted.
    clf = RandomForestClassifier(n_estimators=10, random_state=0)
    features = [eval(t.features) for t in TRAINING_DATA]
    labels = [int(t.same) for t in TRAINING_DATA]
    clf = clf.fit(features, labels)

    print('Processing groups ...')
    groups = Contribution.objects.all().values('group_id').distinct()
    for g in groups:
        if not g['group_id']:
            continue

        # Fresh graph and save-list for each last name group
        G = nx.Graph()
        toupdate = []
        # Hash of the group id, used as the suffix of the donor IDs
        nameid = hashlib.sha224(str(g['group_id'])).hexdigest()

        # Same donor? Add an edge between the two contributions.
        # https://github.com/cjdd3b/fec-standardizer/wiki/Matching-donors
        for m in Match.objects.filter(c1__group_id=g['group_id']):
            probabilities = clf.predict_proba(eval(m.features))[0]
            if probabilities[1] > probabilities[0]:
                G.add_edge(m.c1, m.c2)

        # Connected contributions are probably from the same donor.
        # https://github.com/cjdd3b/fec-standardizer/wiki/Defining-donor-clusters
        for donor_id, cluster in enumerate(connected_components(G)):
            for member in cluster:
                classifier_id = '%s%s' % (donor_id, nameid)
                member.classifier_id = classifier_id[:12]
                toupdate.append(member)

        # Bulk save the donor IDs to the contribution table
        commit_saves(toupdate)

    print('Cleaning up the leftovers ...')
    tocleanup = []
    for record in Contribution.objects.filter(classifier_id__isnull=True):
        if record.match_repr:
            classifier_id = '99%s' % hashlib.sha224(record.match_repr).hexdigest()
            record.classifier_id = classifier_id[:12]
            tocleanup.append(record)
    commit_saves(tocleanup)
def get_result(self):
    """Return the connected groups described by ``self.input_content``.

    Each input line is reduced to the set of its word tokens (letters,
    digits, underscores and apostrophes); ``self.to_graph`` turns those
    sets into a graph.  When ``self.get_groups`` is truthy all components
    are returned as a list, otherwise only the first component that
    contains the token '0'.
    """
    lines = self.input_content.split('\n')
    list_connections = [set(re.findall(r"[\w']+", line.strip())) for line in lines]

    graph = self.to_graph(list_connections)
    list_groups = list(connected_components(graph))
    if self.get_groups:
        return list_groups
    return [subset for subset in list_groups if '0' in subset][0]
def compute_overlapping_pairs(candidate_pairs, published, sources):
    """Select pairs per connected component of the candidate graph.

    ``candidate_pairs`` are added as weighted edges and ``published`` is
    stored as the "published" node attribute.  For every connected
    component the pairs returned by ``select_most_correct_pairs`` are
    collected into one flat list.
    """
    print("Overlapping pairs")
    graph = nx.Graph()
    graph.add_weighted_edges_from(candidate_pairs)
    nx.set_node_attributes(graph, published, name="published")

    selected_pairs = []
    for component in connected_components(graph):
        selected_pairs += select_most_correct_pairs(component, graph, sources)
    return selected_pairs
def cluster(dupes, threshold=.5):
    '''
    Takes in a list of duplicate pairs and clusters them in to a
    list records that all refer to the same entity based on a given
    threshold

    Arguments:
    dupes -- iterable of ((id_a, id_b), score) items

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). lowering the
                 number will increase precision, raising it will increase
                 recall

    Returns a list of sets of record ids; groups with a single member are
    dropped.
    '''
    threshold = 1 - threshold

    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]

    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from((x[0], x[1], y) for (x, y) in dupes)

    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 2:
            # `edges(nbunch, data=True)` exists in every networkx release,
            # unlike `edges_iter`, which was removed in 2.0.
            pair_gen = ((x[0:2], x[2]['weight'])
                        for x in dupe_graph.edges(sub_graph, data=True))
            pairs = numpy.fromiter(pair_gen, dtype=score_dtype)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)
            partition = hcluster.fcluster(linkage, threshold,
                                          criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id,
                                      []).append(i_to_id[i])
            # The keys just written run up to cluster_id + max(partition),
            # so the next free key is one past that.  Without the +1 a
            # following small component overwrote the last sub-cluster.
            cluster_id += max(partition) + 1
        else:
            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) > 1]
    return clusters
def get_connected_components(frames):
    """Take the frames at each level interval and calculate connected components.

    ``frames`` is indexed as (frame, row, col) and holds integer object
    labels, with 0 meaning "no object".  It is relabelled IN PLACE so that
    labels are unique across frames.  Objects in consecutive frames that
    overlap are linked, every connected set of linked objects becomes one
    object, and only objects present in every frame are kept.

    Returns ``(frames_con, frames)``: the array of merged objects, numbered
    consecutively from 1, and the relabelled input.
    """
    f_maxes = frames.max(axis=(1, 2))

    # Relabel frames so that object numbers are unique across frames: frame i
    # is shifted by the sum of the maxima of all earlier frames.
    offsets = np.cumsum(f_maxes[:-1])
    for i in range(1, frames.shape[0]):
        frames[i, frames[i] > 0] += offsets[i - 1]

    # Create graph for cells that overlap at different vertical levels.
    overlap_graph = networkx.Graph()
    # Use the global maximum and include it: range(1, frames[-1].max())
    # skipped the highest label and undercounted when the last frame was
    # empty, so an object without overlaps could vanish from the graph.
    total_objs = int(frames.max())
    overlap_graph.add_nodes_from(range(1, total_objs + 1))

    # Create edges between the objects that overlap vertically.
    for i in range(frames.shape[0] - 1):
        lower, upper = frames[i], frames[i + 1]
        upper_occupied = upper > 0
        for obj in np.unique(lower[lower > 0]):
            overlap = np.logical_and(lower == obj, upper_occupied)
            overlap_objs = np.unique(upper[overlap])
            if overlap_objs.size:
                # Link the object to the first overlapping object ...
                overlap_graph.add_edge(obj, overlap_objs[0])
                # ... and chain the overlapping objects to one another.
                overlap_graph.add_edges_from(
                    zip(overlap_objs[:-1], overlap_objs[1:]))

    # Create new objects based on connected components
    frames_con = np.zeros(frames.shape, dtype=int)
    for label, component in enumerate(connected_components(overlap_graph),
                                      start=1):
        frames_con[np.isin(frames, list(component))] = label

    # Require that objects be present in all vertical level intervals.
    # Labels are visited in ascending order, so a renumbered object can never
    # collide with a label that still has to be processed.
    object_counter = 1
    for label in np.unique(frames_con[frames_con > 0]):
        present = frames_con == label
        if np.all(np.any(present, axis=(1, 2))):
            frames_con[present] = object_counter
            object_counter += 1
        else:
            frames_con[present] = 0

    return frames_con, frames
def cluster(dupes, threshold=.5):
    """
    Takes in a list of duplicate pairs and clusters them in to a
    list records that all refer to the same entity based on a given
    threshold

    Arguments:
    dupes -- iterable of ((id_a, id_b), score) items

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). lowering the
                 number will increase precision, raising it will increase
                 recall

    Returns a list of sets of record ids; groups with a single member are
    dropped.
    """
    threshold = 1 - threshold

    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]

    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from(((x[0], x[1], y) for x, y in dupes))
    # Drop the local reference so the pair list can be reclaimed early.
    del dupes

    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 2:
            # `edges(nbunch, data=True)` is available in all networkx
            # versions; `edges_iter` was removed in networkx 2.0.
            pair_gen = ((x[0:2], x[2]['weight'])
                        for x in dupe_graph.edges(sub_graph, data=True))
            pairs = numpy.fromiter(pair_gen, dtype=score_dtype)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)
            partition = hcluster.fcluster(linkage, threshold,
                                          criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id,
                                      []).append(i_to_id[i])
            # Keys up to cluster_id + max(partition) are now taken; advance
            # past them.  Advancing by max(partition) only let the next
            # small component overwrite the last sub-cluster stored here.
            cluster_id += max(partition) + 1
        else:
            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) > 1]
    return clusters
def init(self):
    """Run the chokepoint analysis and load or compute node centrality.

    ``self.centrality`` is read from ``<work_dir>/centrality.json`` when
    that file exists.  Otherwise the betweenness centrality of the largest
    connected component of the undirected graph is computed, stored on
    ``self.centrality`` and written to that file.
    """
    self.chokepoints()

    centrality_json = self.work_dir + "/centrality.json"
    if os.path.exists(centrality_json):
        # Context managers close the file handles the old code left open.
        with open(centrality_json) as handle:
            self.centrality = json.load(handle)
        return

    # Convert once and find the largest component once; this work is only
    # needed when there is no cached result.
    undirected = self.sbmlprocessor.graph.to_undirected()
    largest = sorted(connected_components(undirected), key=len)[-1]
    g2 = undirected.subgraph(largest)

    # NOTE(review): as before, only nodes of the largest component get an
    # entry.  The old code first built an all-zero dict over every node and
    # immediately overwrote it - confirm whether zeros for the remaining
    # nodes were intended.
    self.centrality = dict(betweenness_centrality(g2))
    with open(centrality_json, "w") as handle:
        json.dump(self.centrality, handle)
def connectedComponents3(self):
    """Finds connected components of the graph (the molecules).

    For every component reported by ``connected_components(self)`` a new
    ChemicalGraph is filled with the component's nodes and their atom
    attributes, and then with edges of this graph.

    NOTE(review): ``pre_ordering`` is never assigned in this method; unless
    it exists at module level the edge filter below raises NameError.  It
    presumably should be ``comp`` - confirm before relying on this method.

    @rtype: list of instances of ChemicalGraph
    @return: List of connected components.
    """
    result = []
    comp_conexos = connected_components(self)
    # nodes still to be assigned; only ever shrunk below, never read
    quedan = self.nodes()
    edges = self.edges()
    for comp in comp_conexos:
        result_graph = ChemicalGraph()
        result_graph.add_nodes(comp)
        for atom in result_graph.nodes():
            result_graph.add_node_attribute(atom, self.getAtomAttributes(atom))
        # keep an edge when both end points pass the membership test and the
        # edge has not been added yet
        for index in edges:
            if (index[0] in pre_ordering) and (
                    index[1] in pre_ordering
            ) and not (index in result_graph.edges()):
                result_graph.add_edge(index)
        # copy corresponding angles and dihedrals
        # NOTE(review): both loops iterate over result_graph's own lists and
        # append to those same lists; iterating over self._angles and
        # self._dihedrals was presumably intended - TODO confirm.
        for atom in result_graph.nodes():
            for angle in result_graph._angles:
                try:
                    # index() raises ValueError when atom is not in the angle
                    pos = angle.index(atom)
                    result_graph._angles.append(angle)
                    break
                except ValueError:
                    pass
            for dihedral in result_graph._dihedrals:
                try:
                    pos = dihedral.index(atom)
                    result_graph._dihedrals.append(dihedral)
                    break
                except ValueError:
                    pass
        result.append(result_graph)
        for v in pre_ordering:
            quedan.remove(v)
    return result
def getConnectedComponents(cls, groupNodes):
    """Get connected components from group of nodes.

    The groups are turned into a graph with ``cls.to_graph`` and the
    components of that graph are collected.

    Parameters
    ----------
    groupNodes : list
        list of list of points

    Returns
    -------
    list
        list of connected components
    """
    return [component
            for component in connected_components(cls.to_graph(groupNodes))]
def test_message_number_graph(self, mock_rec_list_splice, mock_correctLastCharCR, mock_get_nick_sen_rec, mock_get_year_month_day,
                              mock_get_nick_representative, mock_check_if_msg_line, mock_create_connected_nick_list, mock_to_graph):
    """Aggregate message-number graph must match the stored fixture.

    All util helpers are mocked with recorded values loaded from
    data/message_number_graph/.
    """
    data_dir = current_directory + "/data/message_number_graph/"

    to_graph_ret = util.load_from_disk(data_dir + "to_graph")
    conn_list = list(connected_components(to_graph_ret))

    mock_to_graph.return_value = to_graph_ret
    mock_rec_list_splice.side_effect = util.load_from_disk(data_dir + "rec_list_splice")
    mock_create_connected_nick_list.return_value = util.load_from_disk(data_dir + "conn_comp_list")
    mock_check_if_msg_line.side_effect = util.load_from_disk(data_dir + "check_if_msg_line")
    mock_correctLastCharCR.side_effect = util.load_from_disk(data_dir + "correctLastCharCR")
    mock_get_nick_sen_rec.side_effect = util.load_from_disk(data_dir + "get_nick_sen_rec")
    mock_get_nick_representative.side_effect = util.load_from_disk(data_dir + "get_nick_representative")
    mock_get_year_month_day.side_effect = util.load_from_disk(data_dir + "get_year_month_day")

    # Silence the function under test.  The previous stream is restored in
    # `finally`, so a failure cannot leave stdout pointing at a closed
    # buffer, and a runner's own stdout capture is not clobbered.
    capturedOutput = StringIO.StringIO()
    previous_stdout = sys.stdout
    sys.stdout = capturedOutput
    try:
        ret = network.message_number_graph(self.log_data, self.nicks, self.nick_same_list, DAY_BY_DAY_ANALYSIS=False)
    finally:
        sys.stdout = previous_stdout
        capturedOutput.close()

    mock_to_graph.assert_called_once_with(self.nick_same_list)
    mock_create_connected_nick_list.assert_called_once_with(conn_list)
    self.assertTrue(
        nx.is_isomorphic(
            ret,
            util.load_from_disk(data_dir + "aggregate_message_number_graph")))
def mif_preprocess_1(g: MultiGraph, f: set, active_v, k: int) -> set:
    """Split the problem along the connected components of ``g``.

    A graph with fewer than two components is handed to
    ``mif_preprocess_2`` unchanged.  Otherwise every component is processed
    on its own (with its share of ``f`` and no size bound) and the partial
    results are united.  When ``k`` is given it is reduced by the number of
    nodes each component adds beyond its part of ``f``; once it reaches 0
    the set collected so far is returned.

    Returns the united set, or None when ``k`` is given and the set is
    smaller than the remaining ``k``.
    """
    if nxc.number_connected_components(g) < 2:
        return mif_preprocess_2(g, f, active_v, k)

    mif_set = set()
    for component in nxc.connected_components(g):
        f_i = component.intersection(f)
        gx = g.subgraph(component)
        component_mif_set = mif_preprocess_2(gx, f_i, active_v, None)
        if not component_mif_set:
            continue
        mif_set = mif_set.union(component_mif_set)
        if k is not None:
            k -= (len(component_mif_set) - len(f_i))
            if k <= 0:
                return mif_set

    if k is None or len(mif_set) >= k:
        return mif_set
    return None
def test_message_number_graph_day_analysis(self, mock_get_nick_sen_rec, mock_rec_list_splice, mock_correctLastCharCR,
                                           mock_check_if_msg_line, mock_create_connected_nick_list, mock_to_graph):
    """Day-by-day message-number graphs must match the stored fixtures.

    All util helpers are mocked with recorded values loaded from
    data/message_number_graph/.
    """
    data_dir = self.current_directory + "/data/message_number_graph/"

    to_graph_ret = util.load_from_disk(data_dir + "to_graph")
    conn_list = list(connected_components(to_graph_ret))

    mock_to_graph.return_value = to_graph_ret
    mock_rec_list_splice.side_effect = util.load_from_disk(data_dir + "rec_list_splice")
    mock_create_connected_nick_list.return_value = util.load_from_disk(data_dir + "conn_comp_list")
    mock_check_if_msg_line.side_effect = util.load_from_disk(data_dir + "check_if_msg_line")
    mock_correctLastCharCR.side_effect = util.load_from_disk(data_dir + "correctLastCharCR")
    mock_get_nick_sen_rec.side_effect = util.load_from_disk(data_dir + "get_nick_sen_rec")

    # Silence the function under test.  The previous stream is restored in
    # `finally`, so a failure cannot leave stdout pointing at a closed
    # buffer, and a runner's own stdout capture is not clobbered.
    capturedOutput = StringIO.StringIO()
    previous_stdout = sys.stdout
    sys.stdout = capturedOutput
    try:
        ret = network.message_number_graph(self.log_data, self.nicks, self.nick_same_list, DAY_BY_DAY_ANALYSIS=True)
    finally:
        sys.stdout = previous_stdout
        capturedOutput.close()

    expected_graph_list = util.load_from_disk(data_dir + "message_number_day_list")

    mock_to_graph.assert_called_once_with(self.nick_same_list)
    mock_create_connected_nick_list.assert_called_once_with(conn_list)
    self.assertTrue(nx.is_isomorphic(ret[0][0], expected_graph_list[0][0]))
    self.assertTrue(nx.is_isomorphic(ret[1][0], expected_graph_list[1][0]))
def hierarchical_cluster(clusters, threshold):
    """Cluster scored record pairs into groups of records.

    ``clusters`` is an iterable of ((id_a, id_b), score) items.  Pairs are
    added to a graph as weighted edges; every connected component with more
    than two members is split further by centroid-linkage hierarchical
    clustering cut at distance ``1 - threshold``, and components with
    exactly two members are kept as they are.

    Returns ``(clusters, cluster_scores)``: a list of sets with at least
    two members each, and a dict that is currently always empty.
    """
    threshold = 1 - threshold

    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]

    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from((x[0], x[1], y) for (x, y) in clusters)

    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_scores = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 2:
            # `edges(nbunch, data=True)` is available in all networkx
            # versions; `edges_iter` was removed in networkx 2.0.
            pair_gen = ((sorted(x[0:2]), x[2]['weight'])
                        for x in dupe_graph.edges(sub_graph, data=True))
            pairs = np.fromiter(pair_gen, dtype=score_dtype)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)
            partition = hcluster.fcluster(linkage, threshold,
                                          criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id,
                                      []).append(i_to_id[i])
            # Keys up to cluster_id + max(partition) are taken; move past
            # them so a following two-member component cannot overwrite the
            # last sub-cluster written above.
            cluster_id += max(partition) + 1
        elif len(sub_graph) == 2:
            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) >= 2]
    return (clusters, cluster_scores)
def main():
    """
    Command-line entry point.

    Parses the arguments, reads the word lists, finds matches for the
    requested ratio (80 when --ratio is not given), draws the match graph,
    prints every connected group and finally shows the plot.
    """
    parser = argparse.ArgumentParser(
        description=('This program prints out groups of words that have a certain'
                     ' Levenshtein edit distance'))
    parser.add_argument(
        '--ratio',
        dest='min_match_ratio',
        choices=list(map(str, range(0, 101))),
        help='Number that determines the Levenshtein edit distance, should be between 0 and 100')
    parser.add_argument(
        '--files',
        dest='files',
        default='',
        nargs='+',
        help='The files that contains new line separated words')
    parser.add_argument('--version', action='version', version='%(prog)s 1.0alpha')
    opts = parser.parse_args()

    ratio_text = opts.min_match_ratio
    min_match_ratio = int(ratio_text) if ratio_text else 80

    words = read_files_into_list(opts.files)

    # the graph merges match lists that share items
    matches = find_matches(words, min_match_ratio)
    graph = to_graph(matches)
    draw_cluster(graph, networkx.spring_layout(graph))

    for group in connected_components(graph):
        print(group)

    plot.show()
    input('Press enter to continue...')
def group_alignments(alignments, contig_names):
    """Group contigs that are connected through alignments.

    Every alignment links its ``qseqid`` and ``sseqid``; connected contigs
    end up in the same group.  Aligned names are removed from
    ``contig_names`` IN PLACE, and each name left over becomes a group of
    its own.

    Returns a dict mapping an integer group id to the group's members.
    """
    contig_groupings = []
    for alignment in alignments:
        for seqid in (alignment.qseqid, alignment.sseqid):
            if seqid in contig_names:
                contig_names.remove(seqid)
        contig_groupings.append([alignment.qseqid, alignment.sseqid])

    G = to_graph(contig_groupings)
    groupings = dict(enumerate(connected_components(G)))

    # ids of the unaligned contigs continue after the highest id in use
    max_group = max(groupings) if groupings else 0
    for group_id, contig in enumerate(contig_names, start=max_group + 1):
        groupings[group_id] = [contig]
    return groupings
def merge_boxes_in_results(results_dict, min_conf_threshold, iou_threshold):
    """Merge overlapping regions of every entry in ``results_dict``.

    Regions whose ``conf`` is below ``min_conf_threshold`` are first removed
    from the lists in ``results_dict`` (IN PLACE).  For each entry the
    remaining regions are grouped by the connected components of the graph
    built from ``pairwise_overlap_indexing_list`` (largest group first) and
    merged with ``simple_merge``.

    Returns a ``Results`` object holding all merged regions.
    """
    final_results = Results()

    # Clean dict to remove regions below min_conf_threshold
    for regions in results_dict.values():
        low_confidence = [r for r in regions if r.conf < min_conf_threshold]
        for r in low_confidence:
            regions.remove(r)

    for regions in results_dict.values():
        overlap_pairwise_list = pairwise_overlap_indexing_list(
            regions, iou_threshold)
        overlap_graph = to_graph(overlap_pairwise_list)
        grouped_bbox_idx = sorted(connected_components(overlap_graph),
                                  key=len, reverse=True)
        for merged in simple_merge(regions, grouped_bbox_idx):
            final_results.append(merged)

    return final_results
def getIndependentGDTGroups_py(data_slice, threshold, rand_gen=None):
    """Label each feature with the id of its GDT-dependent group.

    The matrix returned by ``pairwise_gdt`` is thresholded (entries below
    ``threshold`` become 0) and interpreted as an adjacency matrix.
    Features in the same connected component share a label.  ``rand_gen``
    is accepted but not used here.

    Returns a float array of length ``len(data_slice.cols)`` whose labels
    start at 1.
    """
    gdt_adjacency_matrix = pairwise_gdt(data_slice)
    n_features = len(data_slice.cols)

    # thresholding
    weak = gdt_adjacency_matrix < threshold
    gdt_adjacency_matrix[weak] = 0

    # getting connected components
    result = numpy.zeros(n_features)
    graph = from_numpy_matrix(gdt_adjacency_matrix)
    for label, members in enumerate(connected_components(graph), start=1):
        result[list(members)] = label
    return result
def test_message_time_graph(self, mock_get_nick_sen_rec, mock_correct_last_char_list,
                            mock_rec_list_splice, mock_check_if_msg_line, mock_create_connected_nick_list, mock_to_graph):
    """Aggregate message-time graph must match the stored fixture.

    All util helpers are mocked with recorded values loaded from
    message_time_graph/ below the test data directory.
    """
    data_dir = self.test_data_dir + "message_time_graph/"

    to_graph_ret = util.load_from_disk(data_dir + "to_graph")
    conn_list = list(connected_components(to_graph_ret))

    mock_to_graph.return_value = to_graph_ret
    mock_rec_list_splice.side_effect = util.load_from_disk(data_dir + "rec_list_splice")
    mock_create_connected_nick_list.return_value = util.load_from_disk(data_dir + "conn_comp_list")
    mock_check_if_msg_line.side_effect = util.load_from_disk(data_dir + "check_if_msg_line")
    mock_get_nick_sen_rec.side_effect = util.load_from_disk(data_dir + "get_nick_sen_rec")
    mock_correct_last_char_list.side_effect = util.load_from_disk(data_dir + "correct_last_char_list")

    # Silence the function under test.  The previous stream is restored in
    # `finally`, so a failure cannot leave stdout pointing at a closed
    # buffer, and a runner's own stdout capture is not clobbered.
    capturedOutput = StringIO.StringIO()
    previous_stdout = sys.stdout
    sys.stdout = capturedOutput
    try:
        graph = network.message_time_graph(self.log_data, self.nicks, self.nick_same_list, DAY_BY_DAY_ANALYSIS=False)
    finally:
        sys.stdout = previous_stdout
        capturedOutput.close()

    mock_to_graph.assert_called_once_with(self.nick_same_list)
    mock_create_connected_nick_list.assert_called_once_with(conn_list)
    self.assertTrue(
        nx.is_isomorphic(
            graph,
            util.load_from_disk(data_dir + "msg_time_aggr_graph")))
def getIndependentGroupsStabilityTest(data, alpha=0.001):
    """Label each column of ``data`` with the id of its dependency group.

    ``computePvals`` is evaluated for every column in a process pool.  The
    resulting matrix is made symmetric by taking the smaller of the two
    mirrored entries, its diagonal is set to 1 and every entry above
    ``alpha`` is set to 0.  What remains is used as an adjacency matrix;
    columns in the same connected component share a label.

    Returns a float array with one label per column, starting at 1.
    """
    column_names = ["V" + str(i) for i in range(1, data.shape[1] + 1)]
    df = DataFrame(data, columns=column_names)

    # compute stability test, one task per column
    with Pool() as pool:
        pvals = pool.starmap(computePvals, zip(repeat(df), range(df.shape[1])))
    pvals = numpy.asarray(pvals)

    # convert graph to undirected graph: mirror the smaller value of each pair
    rows, cols = numpy.tril_indices(pvals.shape[1])
    for i, j in zip(rows, cols):
        smaller = min(pvals[i, j], pvals[j, i])
        pvals[i, j] = smaller
        pvals[j, i] = smaller

    pvals[numpy.diag_indices_from(pvals)] = 1
    pvals[pvals > alpha] = 0

    result = numpy.zeros(df.shape[1])
    graph = from_numpy_matrix(pvals)
    for label, members in enumerate(connected_components(graph), start=1):
        result[list(members)] = label
    return result
def test_message_number_graph(self):
    """Aggregate message-number graph must match the stored fixture."""
    # The previous stream is restored in `finally`, so a failure cannot
    # leave stdout pointing at a closed buffer, and a runner's own stdout
    # capture is not clobbered.  The unused `to_graph_ret` and `conn_list`
    # locals were dropped: nothing in this unmocked test read them.
    capturedOutput = StringIO.StringIO()
    previous_stdout = sys.stdout
    sys.stdout = capturedOutput
    try:
        ret = network.message_number_graph(self.log_data, self.nicks, self.nick_same_list, DAY_BY_DAY_ANALYSIS=False)
    finally:
        sys.stdout = previous_stdout
        capturedOutput.close()

    expected = util.load_from_disk(
        self.test_data_dir + "message_number_graph/aggregate_message_number_graph")
    self.assertTrue(nx.is_isomorphic(ret, expected))
def group_alignments(alignments, fosmids_to_ignore, fosmid_names):
    """Group fosmids that are connected through alignments.

    Alignments involving a name from ``fosmids_to_ignore`` are skipped.
    Every other alignment links its ``qseqid`` and ``sseqid``; connected
    fosmids end up in the same group.  Aligned names are removed from
    ``fosmid_names`` IN PLACE, and each name left over becomes a group of
    its own.

    Returns a dict mapping an integer group id to the group's members.
    """
    fosmid_groupings = []
    for alignment in alignments:
        if alignment.qseqid in fosmids_to_ignore:
            continue
        if alignment.sseqid in fosmids_to_ignore:
            continue
        for seqid in (alignment.qseqid, alignment.sseqid):
            if seqid in fosmid_names:
                fosmid_names.remove(seqid)
        fosmid_groupings.append([alignment.qseqid, alignment.sseqid])

    G = to_graph(fosmid_groupings)
    groupings = dict(enumerate(connected_components(G)))

    # ids of the unaligned fosmids continue after the highest id in use
    max_group = max(groupings) if groupings else 0
    for group_id, fosmid in enumerate(fosmid_names, start=max_group + 1):
        groupings[group_id] = [fosmid]
    return groupings
def mif_preprocess_2(g: MultiGraph, f: set, active_v, k: int) -> set:
    """Compress ``f`` until it is an independent set, then run ``mif_main``.

    While ``is_independent_set(g, f)`` is false, the first connected
    component of ``g.subgraph(f)`` with more than one node is compressed
    into a single node via ``compress``.  If that component contained the
    active vertex, the compressed node becomes the new active vertex.  All
    nodes seen in ``f`` on the way are remembered and united with the
    result of ``mif_main``.

    Returns the united set, or None when ``k`` is given and the set has
    fewer than ``k`` members.
    """
    mif_set = set()
    while not is_independent_set(g, f):
        mif_set = mif_set.union(f)
        for component in nxc.connected_components(g.subgraph(f)):
            if len(component) <= 1:
                continue
            # membership has to be checked before a node is popped
            holds_active = active_v in component
            compressed_node = component.pop()
            if holds_active:
                active_v = compressed_node
            g = compress(g, component, compressed_node, True)
            # Maybe faster with f.difference(component) + f.add(compressed_node)
            f = f.intersection(g.nodes())
            mif_set = mif_set.union(component)
            # one compression per pass, then re-check independence
            break

    mif_set2 = mif_main(g, f, active_v, k)
    if mif_set2:
        mif_set = mif_set2.union(mif_set)

    if k is None or len(mif_set) >= k:
        return mif_set
    return None
def message_time_graph(log_dict, nicks, nick_same_list, DAY_BY_DAY_ANALYSIS=False):
    """ creates a directed graph where each edge denotes a message sent from a user to another user
    with the stamp denoting the time at which the message was sent

    Args:
        log_dict (dictionary): Dictionary of logs data created using reader.py
        nicks(List) : List of nickname created using nickTracker.py
        nick_same_list(List) :List of same_nick names created using nickTracker.py
        DAY_BY_DAY_ANALYSIS (bool): selects which of the two results is returned

    Returns:
       msg_time_graph_list(List): List of message time graphs for different days
           (returned when DAY_BY_DAY_ANALYSIS is true)
       msg_time_aggr_graph: aggregate message time graph where edges are date + time when sender sends a message to receiver
           (returned otherwise)
    """
    msg_time_graph_list = []
    msg_time_aggr_graph = nx.MultiDiGraph()
    # Nicks that belong to the same user form one connected component.
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))

    # Records a message edge when `nick_to_compare` addresses `nick_name` and
    # the sender is not addressing itself.  Reads nick_sender, year, month,
    # day and graph_conversation from the enclosing loop below.
    def compare_spliced_nick(nick_to_compare, spliced_nick, nick_name, line):
        if(nick_to_compare == nick_name):
            if(spliced_nick != nick_name):
                nick_receiver = nick_receiver_from_conn_comp(nick_name, conn_comp_list)
                util.build_graphs(nick_sender, nick_receiver, line[1:6], year, month, day, graph_conversation, msg_time_aggr_graph)

    util.create_connected_nick_list(conn_comp_list)

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            year, month, day = util.get_year_month_day(day_content)
            graph_conversation = nx.MultiDiGraph()  #graph with multiple directed edges between clients used
            for line in day_log:
                flag_comma = 0
                if(util.check_if_msg_line (line)):
                    # The sender is the text between the first '<' and '>'.
                    m = re.search(r"\<(.*?)\>", line)
                    spliced_nick = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = ""
                    nick_sender = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, spliced_nick, conn_comp_list, nick_sender)

                    # Every known nick is tested as a possible addressee.
                    for nick_name in nicks:
                        rec_list = [e.strip() for e in line.split(':')]  #receiver list splited about :
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:  #index 0 will contain time 14:02
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for nick_to_search in rec_list:
                            if(nick_to_search == nick_name):
                                if(spliced_nick != nick_name):
                                    nick_receiver = ""
                                    nick_receiver = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, nick_name, conn_comp_list, nick_receiver)
                                    util.build_graphs(nick_sender, nick_receiver, line[1:6], year, month, day, graph_conversation, msg_time_aggr_graph)

                        if "," in rec_list[1]:  #receiver list may of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            for nick_to_search in rec_list_2:
                                compare_spliced_nick(nick_to_search, spliced_nick, nick_name, line)

                        if(flag_comma == 0):  #receiver list can be <Dhruv> Rohan, Hi!
                            rec = line[line.find(">") + 1:line.find(", ")]
                            rec = util.correctLastCharCR(rec[1:])
                            compare_spliced_nick(rec, spliced_nick, nick_name, line)

            # One graph per day, appended whether or not it gained edges.
            msg_time_graph_list.append(graph_conversation)

    if DAY_BY_DAY_ANALYSIS:
        return msg_time_graph_list
    else:
        return msg_time_aggr_graph
for i, v in enumerate(statics.values()): tempsum += len(v) print "Avg neighbours: {:.2f}, total nodes: {:3d}".format(tempsum*1.0/i, len(statics)) print sorted(statics.keys()) for node, neighbour_list in statics.iteritems(): for nid, num in neighbour_list.iteritems(): G.add_edge(node, nid) pos = nx.graphviz_layout(G) nodelist = G.nodes() print connected_components(G) labels={} pl.figure() for node in nodelist: labels[node] = node nx.draw_networkx_nodes(G, pos, node_size = 120, node_color='r') #nx.draw_networkx_nodes(G, pos, node_size = 200, nodelist=[20,75], node_color='b') nx.draw_networkx_edges(G, pos, alpha=0.5) nx.draw_networkx_labels(G,pos, labels=labels, font_size=12) pl.savefig("connectivity.pdf") pl.show() ################################################ABOVE IS CONNECTIVITY################################
def gcc_ratio(self ):
    """Return the share of nodes that lie in the largest connected component.

    The result is the size of the biggest component of ``self._network``
    divided by the network's total number of nodes, as a float.
    """
    component_sizes = [len(component) for component in connected_components(self._network)]
    largest = float(max(component_sizes))
    total = float(self._network.number_of_nodes())
    return largest / total
def response_time(log_dict, nicks, nick_same_list):
    """ finds the response time of a message
    i.e. the best guess for the time at which one can expect a reply for his/her message.

    Args:
        log_dict (str): Dictionary of logs data created using reader.py
        nicks(List) : List of nickname created using nickTracker.py
        nick_same_list :List of same_nick names created using nickTracker.py

    Returns:
        rows_RT(zip List): Response Time (This refers to the response
        time of a message i.e. the best guess for the time at
        which one can expect a reply for his/her message),
        as (response time in minutes, number of occurrences) pairs
    """
    # Nicks that belong to the same user form one connected component.
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))

    util.create_connected_nick_list(conn_comp_list)

    graph_cumulative = []
    graph_x_axis = []
    graph_y_axis = []

    # Appends every entry of conversations[index] from position 2 onwards
    # (positions 0 and 1 hold sender and receiver) to mean_list.
    def build_mean_list(conversations, index, mean_list):
        for j in range(2, len(conversations[index])):
            mean_list.append(conversations[index][j])
        return mean_list

    # Records the time stamp of the current `line` (read from the enclosing
    # loop) in the conversation slot of the (sender, receiver) pair, claiming
    # the first empty slot when the pair has none yet.
    def resp_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list):
        if(rec == nick):
            send_time.append(line[1:6])
            if(nick_to_search != nick):
                nick_receiver = util.get_nick_sen_rec(len(nicks), nick, conn_comp_list, nick_receiver)
                for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                    if (nick_sender in conversations[i] and nick_receiver in conversations[i]):
                        conversations[i].append(line[1:6])
                        break
                    if(len(conversations[i]) == 0):
                        conversations[i].append(nick_sender)
                        conversations[i].append(nick_receiver)
                        conversations[i].append(line[1:6])
                        break
        return conversations, nick_receiver, send_time

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            send_time = [] #list of all the times a user sends a message to another user
            meanstd_list = []
            totalmeanstd_list = []
            x_axis = []
            y_axis = []
            real_y_axis = []
            # Fixed number of slots; each used slot is
            # [sender, receiver, time, time, ...].
            conversations = [[] for i in range(config.MAX_RESPONSE_CONVERSATIONS)]

            #code for making relation map between clients
            for line in day_log:
                flag_comma = 0
                if(util.check_if_msg_line (line)):
                    nick_sender = ""
                    nick_receiver = ""
                    # The sender is the text between the first '<' and '>'.
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_search = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = util.get_nick_sen_rec(len(nicks), nick_to_search, conn_comp_list, nick_sender)

                    # Every known nick is tested as a possible addressee.
                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)

                        for name in rec_list:
                            conversations, nick_receiver, send_time = resp_helper(name, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)

                        # Several addressees separated by commas.
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            for name in rec_list_2:
                                conversations, nick_receiver, send_time = resp_helper(name, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)

                        if(flag_comma == 0):
                            rec = util.splice_find(line, ">", ", ",1)
                            conversations, nick_receiver, send_time = resp_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)

            # Replace each "HH:MM" stamp (except the last) by the number of
            # minutes until the next stamp of the same conversation.
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    for j in range(2, len(conversations[i]) - 1):
                        conversations[i][j]=(int(conversations[i][j+1][0:2])*config.MINS_PER_HOUR+int(conversations[i][j+1][3:5])) - (int(conversations[i][j][0:2])*config.MINS_PER_HOUR+int(conversations[i][j][3:5]))

            # A conversation with a single stamp keeps it, converted to
            # minutes since midnight; otherwise the trailing, still
            # unconverted stamp is dropped.
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    if(len(conversations[i]) == 3):
                        conversations[i][2] = int(conversations[i][2][0:2])*config.MINS_PER_HOUR+int(conversations[i][2][3:5])
                    else:
                        del conversations[i][-1]

            #Explanation provided in parser-CL+CRT.py
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    totalmeanstd_list = build_mean_list(conversations, i, totalmeanstd_list)

            if(len(totalmeanstd_list) != 0):
                for i in range(max(totalmeanstd_list) + 1):
                    x_axis.append(i)

                for i in x_axis:
                    y_axis.append(float(totalmeanstd_list.count(i)) / float(len(totalmeanstd_list)))
                #finding the probability of each RT to occur=No. of occurence/total occurences.

                # NOTE(review): real_y_axis is never read after this block,
                # and the first iteration (i == 0) adds y_axis[0] to
                # real_y_axis[-1], which is y_axis[0] itself.
                real_y_axis.append(y_axis[0])
                for i in range(len(y_axis)):
                    real_y_axis.append(float(real_y_axis[i-1]) + float(y_axis[i]))
                #to find cumulative just go on adding the current value to previously cumulated value till sum becomes 1 for last entry.

            # Collect this day's values for the overall histogram before the
            # summary statistics are appended to the same list.
            for i in range(len(totalmeanstd_list)):
                graph_cumulative.append(totalmeanstd_list[i])

            if len(totalmeanstd_list) > 0:
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list))
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list)+2*numpy.std(totalmeanstd_list))

            # Append mean and mean + 2*std to every used conversation slot.
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    meanstd_list = build_mean_list(conversations, i, meanstd_list)
                    conversations[i].append(numpy.mean(meanstd_list))
                    conversations[i].append(numpy.mean(meanstd_list)+(2*numpy.std(meanstd_list)))
                    meanstd_list[:] = []

    graph_cumulative.sort()

    # NOTE(review): indexing the last element raises IndexError when no
    # response time was collected at all.
    for i in range(graph_cumulative[len(graph_cumulative)-1] + 1):
        graph_y_axis.append(graph_cumulative.count(i)) # problem when ti=0 count is unexpectedly large
        graph_x_axis.append(i)

    #Finally storing the RT values along with their frequencies in a csv file.
    rows_rt = zip(graph_x_axis, graph_y_axis)

    return rows_rt
def conv_len_conv_refr_time(log_dict, nicks, nick_same_list):
    """ Calculates the conversation length (CL) that is the length of time for which two users communicate
    i.e. if a message is not replied to within Response Time(RT),
    then it is considered as a part of another conversation.
    This function also calculates the conversation refresh time(CRT)
    For a pair of users, this is the time when one conversation ends and another one starts.

    Args:
        log_dict (str): Dictionary of logs data created using reader.py
        nicks(List) : list of nickname created using nickTracker.py
        nick_same_list :List of same_nick names created using nickTracker.py

    Returns:
        row_cl(zip List): Conversation Length, as (value, number of occurrences) pairs
        row_crt(zip List) :Conversation Refresh time, as (value, number of occurrences) pairs
    """
    conv = []
    conv_diff = []

    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))

    util.create_connected_nick_list(conn_comp_list)

    # We use connected components algorithm to group all those nick clusters that have atleast one nick common in their clusters. So e.g.
    #Cluster 1- nick1,nick2,nick3,nick4(some nicks of a user)
    #Cluster 2 -nick5,nick6,nick2,nick7. Then we would get - nick1,nick2,nick3,nick4,nick5,nick6,nick7 and we can safely assume they belong to the same user.

    # Fixed number of slots (config.MAX_CONVERSATIONS); each used slot is
    # [sender, receiver, minute stamp, minute stamp, ...].
    conversations=[[] for i in range(config.MAX_CONVERSATIONS)]
    #This might need to be incremented from 10000 if we have more users. Same logic as the above 7000 one. Applies to all the other codes too.
    ## I would advice on using a different data structure which does not have an upper bound like we do in arrays.

    graphx1 =[]
    graphy1 =[]
    graphx2 =[]
    graphy2 =[]

    dateadd = -1 #Variable used for response time calculation. Varies from 0-365.

    # Runs conv_helper for every name of rec_list.
    def build_conversation(rec_list, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list):
        for names in rec_list:
            conversations, nick_receiver, send_time = conv_helper(names, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list)
        return conversations, nick_receiver, send_time

    # Records the time stamp of the current `line` (read from the enclosing
    # loop) in the slot of the (sender, receiver) pair, claiming the first
    # empty slot when the pair has none yet.
    def conv_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list):
        if(rec == nick):
            send_time.append(line[1:6])
            if(nick_to_search != nick):
                nick_receiver = util.get_nick_sen_rec(len(nicks), nick, conn_comp_list, nick_receiver)
                for i in range(config.MAX_CONVERSATIONS):
                    if (nick_sender in conversations[i] and nick_receiver in conversations[i]):
                        conversations = conv_append(conversations, i, dateadd, line)
                        break
                    if(len(conversations[i]) == 0):
                        conversations[i].append(nick_sender)
                        conversations[i].append(nick_receiver)
                        conversations = conv_append(conversations, i, dateadd, line)
                        break
        return conversations, nick_receiver, send_time

    def conv_mat_diff(i,j,conversations):
        """ Difference between entry j and entry j-1 of conversations[i].

            i(int): matrix index for row
            j(int): matrix index for column
        """
        return (conversations[i][j]-conversations[i][j-1])

    # Appends the stamp of `line` as minutes counted from the first processed
    # day (dateadd is the zero-based day counter).
    def conv_append(conversations, index, dateadd, line):
        conversations[index].append(config.HOURS_PER_DAY*config.MINS_PER_HOUR*dateadd + int(line[1:6][0:2])*config.MINS_PER_HOUR + int(line[1:6][3:5]))
        return conversations

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            dateadd = dateadd + 1
            send_time = [] #list of all the times a user sends a message to another user
            #code for making relation map between clients
            for line in day_log:
                flag_comma = 0
                if(util.check_if_msg_line (line)):
                    nick_sender = ""
                    nick_receiver = ""
                    # The sender is the text between the first '<' and '>'.
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_search = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = util.get_nick_sen_rec(len(nicks), nick_to_search, conn_comp_list, nick_sender)

                    # Every known nick is tested as a possible addressee.
                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        conversations, nick_receiver, send_time = build_conversation(rec_list, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list)

                        # Several addressees separated by commas.
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            conversations, nick_receiver, send_time = build_conversation(rec_list_2, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list)

                        if(flag_comma == 0):
                            rec = util.splice_find(line, ">", ", ", 1)
                            conversations, nick_receiver, send_time = conv_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list)

    # The loops above handle messages addressed as (nick1:nick2 or nick1,nick2 or nick1,nick2:) and store their time stamps in conversations. conversations[i] contains all the stamps between userA and userB over the whole log.

    for i in range(len(conversations)): # remove the first two elements from every conversations[i] as they are the UIDS of sender and receiver respectively (and not time stamps)
        if(len(conversations[i]) != 0): # time stamps start at index 2, so after this only stamps remain in conversations.
            del conversations[i][0:2]

    for i in range(len(conversations)):
        if(len(conversations[i]) != 0):
            first = conversations[i][0]
            for j in range(1, len(conversations[i])):
                if(conv_mat_diff(i, j, conversations) > 9):
                    conv.append(conversations[i][j-1] - first)
                    #We are recording the conversation length in conv and CRT in conv_diff. Here 9 is the average response
                    #time we have already found before(see parser-RT.py). For every channel this value differs and would have to be changed in the code.
                    conv_diff.append(conv_mat_diff(i, j, conversations))
                    first = conversations[i][j]
                # The last stamp always closes the running conversation.
                if(j == (len(conversations[i]) - 1)):
                    conv.append(conversations[i][j] - first)
                    break

    # Histogram of conv_list over 0 .. max(conv_list) - 1.
    # NOTE(review): max() raises ValueError when conv_list is empty, and the
    # largest value itself is excluded by range(max(...)).
    def build_conv_csv(conv_list, graph_x, graph_y):
        for i in range(max(conv_list)):
            graph_x.append(i)
            graph_y.append(conv_list.count(i))
        return graph_x, graph_y

    graphx1, graphy1 = build_conv_csv(conv, graphx1, graphy1)
    graphx2, graphy2 = build_conv_csv(conv_diff, graphx2, graphy2)
    #To plot CDF we store the CL and CRT values and their number of occurences as shown above.

    row_cl = zip(graphx1, graphy1)
    row_crt = zip(graphx2, graphy2)

    return row_cl, row_crt
def findBonds(self, ratio=setting.bond_ratio, **kwargs):
    """Detect bonds between atoms and split the molecule into bonded segments.

    Two atoms i < j are bonded when their distance is smaller than the sum of
    their covalent radii (each enlarged by its uncertainty) times ``ratio``.
    For every bond an entry is written to ``self.bonds`` (atoms, indices,
    length, name, energy) and ``self.bond_types`` counts the bond names.
    ``self.segments`` is rebuilt: one molecule per connected group of bonded
    atoms, followed by one molecule per atom without any bond.

    :param ratio: scaling factor applied to the summed covalent radii
    :param kwargs: forwarded to ``self.getSegment``; a true ``no_report``
                   entry suppresses the progress report
    """
    del self.segments
    del self.bond_types
    self.segments = []
    self.bond_types = {}
    # NOTE(review): self.bonds is not cleared here; entries of a previous
    # call with a larger index than this run produces would survive - confirm.
    if 'no_report' not in kwargs or not kwargs['no_report']:
        qtk.report("Molecule", "finding bonds with cutoff ratio", ratio)

    def to_graph(l):
        G = networkx.Graph()
        for part in l:
            # each sublist is a bunch of nodes
            G.add_nodes_from(part)
            # it also implies a number of edges:
            G.add_edges_from(to_edges(part))
        return G

    def to_edges(l):
        """
            treat `l` as a Graph and returns it's edges
            to_edges(['a','b','c','d']) -> [(a,b), (b,c),(c,d)]
        """
        it = iter(l)
        last = next(it)

        for current in it:
            yield last, current
            last = current

    itr = 0
    bond_list = []
    bonded = [False for i in range(self.N)]
    for i in xrange(self.N):
        for j in xrange(i+1, self.N):
            d_ij = np.linalg.norm(self.R[i,:] - self.R[j,:])
            atom_i = getattr(pt, self.type_list[i])
            atom_j = getattr(pt, self.type_list[j])
            Ri = atom_i.covalent_radius + \
                 atom_i.covalent_radius_uncertainty
            Rj = atom_j.covalent_radius + \
                 atom_j.covalent_radius_uncertainty
            Dij = (Ri+Rj) * float(ratio)
            if d_ij < Dij:
                bonded[i] = True
                bonded[j] = True
                # The atom with the smaller Z is stored as the bond's begin.
                if self.Z[i] < self.Z[j]:
                    atom_begin = self.Z[i]
                    atom_end = self.Z[j]
                    index_begin = i
                    index_end = j
                else:
                    atom_begin = self.Z[j]
                    atom_end = self.Z[i]
                    index_begin = j
                    index_end = i
                self.bonds[itr] = {'atom_begin' : atom_begin,
                                   'index_begin' : index_begin,
                                   'atom_end' : atom_end,
                                   'index_end' : index_end,
                                   'length' : d_ij}
                bond_list.append([i, j])

                type_begin = qtk.Z2n(atom_begin)
                type_end = qtk.Z2n(atom_end)
                bond_table = qtk.data.elements.bond_table
                # Candidate names for single, double and triple bonds.
                bond_keys = [
                    type_begin + _ + type_end for _ in ['-', '=', '#']
                ]
                # Only candidates present in the table take part in the
                # nearest-length search.  The winning index must be applied
                # to this filtered list: indexing the unfiltered bond_keys
                # picked the wrong name whenever a candidate was missing.
                known_keys = [
                    k for k in bond_keys if k in bond_table.keys()
                ]
                try:
                    bond_type_ind = np.argmin(
                        abs(
                            np.array([
                                bond_table[k][0] for k in known_keys
                            ]) - d_ij
                        )
                    )
                except Exception as _e:
                    self.write_xyz()
                    qtk.exit(
                        "error while processing bond" +\
                        str(bond_keys) + "with error message %s" % str(_e))
                bond_type = known_keys[bond_type_ind]
                self.bonds[itr]['name'] = bond_type
                bond_energy = \
                    bond_table[bond_type][1] * \
                    qtk.convE(1, 'kj-kcal')[0]
                self.bonds[itr]['energy'] = bond_energy
                if np.isnan(bond_energy):
                    qtk.warning("Non-tabliated covalent bond %s" % bond_type)
                if bond_type in self.bond_types:
                    self.bond_types[bond_type] += 1
                else:
                    self.bond_types[bond_type] = 1
                itr += 1

    # Every connected group of bonded atoms becomes one segment molecule,
    # named after its position in self.segments.
    segments = list(connected_components(to_graph(bond_list)))
    for component in segments:
        segment = list(component)
        new_mol = self.getSegment(segment, **kwargs)
        ns = len(self.segments)
        new_mol.name = new_mol.name + '_%d' % ns
        self.segments.append(new_mol)
    # Atoms without any bond form single-atom segments.
    for s in [i for i in range(self.N) if not bonded[i]]:
        segment = [s]
        new_mol = self.getSegment(segment, **kwargs)
        ns = len(self.segments)
        new_mol.name = new_mol.name + '_%d' % ns
        self.segments.append(new_mol)
#print(x) for ni in nicks: for ind in range(500): if ni in x[ind]: break if not x[ind]: x[ind].append(ni) break #print("*********************x**********************************") #print(x) G = to_graph(x) L = list(connected_components(G)) for i in range(1,len(L)+1): L[i-1] = list(L[i-1]) #print(L) for iterator in range(1,13): for fileiterator in range(1,32): if(fileiterator<10): sttring="/home/dhruvie/LOP/2013/"+str(iterator)+"/0" sttring=sttring+str(fileiterator)+"/#kubuntu-devel.txt"
def implementWithIgraphs(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth): nick_same_list=[[] for i in range(5000)] nicks = [] #list of all the nicknames conversations=[[] for i in range(5000)] for i in xrange(0,5000): conversations[i].append(0) for folderiterator in range(startingMonth, endingMonth+1): temp1 = "0" if folderiterator < 10 else "" for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32): temp2 = "0" if fileiterator < 10 else "" filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt" if not os.path.exists(filePath): if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): print "[Error] Path "+filePath+" doesn't exist" continue with open(filePath) as f: content = f.readlines() #contents stores all the lines of the file channel_name #contents stores all the lines of the file kubunutu-devel print(filePath) send_time = [] #list of all the times a user sends a message to another user nicks_for_the_day = [] channel= "#kubuntu-devel" #channel name #code for getting all the nicknames in a list for i in content: if(i[0] != '=' and "] <" in i and "> " in i): m = re.search(r"\<(.*?)\>", i) if m.group(0) not in nicks_for_the_day: nicks_for_the_day.append(m.group(0)) #used regex to get the string between <> and appended it to the nicks list for i in xrange(0,len(nicks_for_the_day)): if nicks_for_the_day[i][1:-1] not in nicks: nicks.append(nicks_for_the_day[i][1:-1]) #removed <> from the nicknames for i in xrange(0,len(nicks)): if(len(nicks[i])!=0): nicks[i]=correctLastCharCR(nicks[i]) for j in content: if(j[0]=='=' and "changed the topic of" not in j): line1=j[j.find("=")+1:j.find(" is")] line2=j[j.find("wn as")+1:j.find("\n")] 
line1=line1[3:] line2=line2[5:] if(len(line1)!=0): line1=correctLastCharCR(line1) if(len(line2)!=0): line2=correctLastCharCR(line2) if line1 not in nicks: nicks.append(line1) if line2 not in nicks: nicks.append(line2) #code for forming list of lists for avoiding nickname duplicacy for line in content: if(line[0]=='=' and "changed the topic of" not in line): line1=line[line.find("=")+1:line.find(" is")] line2=line[line.find("wn as")+1:line.find("\n")] line1=line1[3:] line2=line2[5:] if(len(line1)!=0): line1=correctLastCharCR(line1) if(len(line2)!=0): line2=correctLastCharCR(line2) for i in range(5000): if line1 in nick_same_list[i] or line2 in nick_same_list[i]: if line1 in nick_same_list[i] and line2 not in nick_same_list[i]: nick_same_list[i].append(line2) break if line2 in nick_same_list[i] and line1 not in nick_same_list[i]: nick_same_list[i].append(line1) break if line2 in nick_same_list[i] and line1 in nick_same_list[i]: break if not nick_same_list[i]: nick_same_list[i].append(line1) nick_same_list[i].append(line2) break #print(x) for ni in nicks: for ind in range(5000): if ni in nick_same_list[ind]: break if not nick_same_list[ind]: nick_same_list[ind].append(ni) break #print("*********************x**********************************") #print(nick_same_list) G = to_graph(nick_same_list) L = connected_components(G) for i in range(1,len(L)+1): L[i-1] = [i]+L[i-1] #print(L) #Uptil here we have all the nicks of the same user clustered together. 
for folderiterator in range(startingMonth, endingMonth+1): temp1 = "0" if folderiterator < 10 else "" for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32): temp2 = "0" if fileiterator < 10 else "" filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt" if not os.path.exists(filePath): if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )): print "[Error] Path "+filePath+" doesn't exist" continue with open(filePath) as f: content = f.readlines() #contents stores all the lines of the file channel_name print(filePath) for line in content: flag_comma = 0 if(line[0] != '=' and "] <" in line and "> " in line): m = re.search(r"\<(.*?)\>", line) var = m.group(0)[1:-1] var=correctLastCharCR(var) for d in range(len(nicks)): if var in L[d]: nick_sender = L[d][0] break for i in nicks: rec_list=[e.strip() for e in line.split(':')] rec_list[1]=rec_list[1][rec_list[1].find(">")+1:len(rec_list[1])] rec_list[1]=rec_list[1][1:] if not rec_list[1]: break for ik in xrange(0,len(rec_list)): if(rec_list[ik]): rec_list[ik]=correctLastCharCR(rec_list[ik]) for z in rec_list: if(z==i): send_time.append(line[1:6]) if(var != i): for d in range(len(nicks)): if i in L[d]: nick_receiver=L[d][0] break for rt in xrange(0,5000): if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]): if (nick_sender == conversations[rt][1] and nick_receiver == conversations[rt][2]): conversations[rt][0]=conversations[rt][0]+1 break if(len(conversations[rt])==1): conversations[rt].append(nick_sender) conversations[rt].append(nick_receiver) conversations[rt][0]=conversations[rt][0]+1 break if "," in rec_list[1]: flag_comma = 1 rec_list_2=[e.strip() for e in rec_list[1].split(',')] for ij in xrange(0,len(rec_list_2)): 
if(rec_list_2[ij]): rec_list_2[ij]=correctLastCharCR(rec_list_2[ij]) for j in rec_list_2: if(j==i): send_time.append(line[1:6]) if(var != i): for d in range(len(nicks)): if i in L[d]: nick_receiver=L[d][0] break for rt in xrange(0,5000): if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]): if (nick_sender == conversations[rt][1] and nick_receiver == conversations[rt][2]): conversations[rt][0]=conversations[rt][0]+1 break if(len(conversations[rt])==1): conversations[rt].append(nick_sender) conversations[rt].append(nick_receiver) conversations[rt][0]=conversations[rt][0]+1 break if(flag_comma == 0): rec=line[line.find(">")+1:line.find(", ")] rec=rec[1:] rec=correctLastCharCR(rec) if(rec==i): send_time.append(line[1:6]) if(var != i): for d in range(len(nicks)): if i in L[d]: nick_receiver=L[d][0] break for rt in xrange(0,5000): if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]): if (nick_sender == conversations[rt][1] and nick_receiver == conversations[rt][2]): conversations[rt][0]=conversations[rt][0]+1 break if(len(conversations[rt])==1): conversations[rt].append(nick_sender) conversations[rt].append(nick_receiver) conversations[rt][0]=conversations[rt][0]+1 break G = Graph(directed=True) #graph with multiple directed edges between clients used #Notice how the syntax changes with python-igraphs as compared to networkx. vertex1=[] edge1=[] for fin in xrange(0,5000): if(len(conversations[fin])==3): if(str(conversations[fin][1]) not in vertex1): G.add_vertex(str(conversations[fin][1])) vertex1.append(str(conversations[fin][1])) if(str(conversations[fin][2]) not in vertex1): G.add_vertex(str(conversations[fin][2])) vertex1.append(str(conversations[fin][2])) #vertex1 contains the vertex names. 
edge1.append(conversations[fin][0]) #edge1 contains the edge weights G.add_edge(str(conversations[fin][1]),str(conversations[fin][2])) G.vs['name'] = vertex1 G.es['label'] = edge1 #Here we add all the labels like color,name,id,weights etc that we want in our graph G.es['weight'] = edge1 G.vs['id'] = G.vs['name'] G.es['width'] = edge1 #print(vertex1) #print(conversations) #G.write_adjacency("adja_wholeyear.csv",sep=',') #Igraphs has a simple function for printing the adjacency matrix of a graph to a csv file. G.write_pajek("checkpajek.net") #writes a graph in pajek format. plot(G, "checkgraph.png",edge_width=rescale(edge1,out_range=(1, 15)),layout = G.layout_fruchterman_reingold(), edge_arrow_size=0.5, vertex_size=8)
def message_number_graph(log_dict, nicks, nick_same_list, DAY_BY_DAY_ANALYSIS=False):
    """ Creates a directed graph
        with each node representing an IRC user
        and each directed edge has a weight which
        mentions the number messages sent and recieved by that user
        in the selected time frame.

    Args:
        log_dict (dict): with key as dateTime.date object and value as {"data":datalist,"channel_name":channels name}
        nicks(list): list of all the nicks
        nick_same_list(list): list of lists mentioning nicks which belong to same users
        DAY_BY_DAY_ANALYSIS (bool): selects which of the two results is returned

    Returns:
       message_number_graph (nx graph object): the aggregate graph when
           DAY_BY_DAY_ANALYSIS is false; otherwise a list of
           [graph of the day, 'year-month-day'] entries
    """
    message_number_day_list = []
    # Fixed number of slots, each starting as [0]; a slot is considered used
    # once it has three entries (see message_no_add_egde).
    conversations=[[0] for i in range(config.MAX_EXPECTED_DIFF_NICKS)]
    aggregate_message_number_graph = nx.DiGraph() #graph with multiple directed edges between clients used

    # Nicks that belong to the same user form one connected component.
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))

    util.create_connected_nick_list(conn_comp_list)

    # For every receiver in rec_list equal to `nick` (and different from the
    # sender) extends either the day's or the aggregate conversation list.
    # nick_sender is read from the enclosing loop.
    # NOTE(review): the results are only bound to local names here; the outer
    # lists change only if util.extend_conversation_list mutates in place -
    # confirm.
    def msg_no_analysis_helper(rec_list, corrected_nick, nick, conn_comp_list,conversations,today_conversation):
        for receiver in rec_list:
            if(receiver == nick):
                if(corrected_nick != nick):
                    nick_receiver = ''
                    nick_receiver = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, nick, conn_comp_list, nick_receiver)

                    if DAY_BY_DAY_ANALYSIS:
                        today_conversation = util.extend_conversation_list(nick_sender, nick_receiver, today_conversation)
                    else:
                        conversations = util.extend_conversation_list(nick_sender, nick_receiver, conversations)

    # Adds one weighted edge per three-entry slot whose count reaches the
    # threshold and whose two nick entries are long enough.
    def message_no_add_egde(message_graph, conversation):
        for index in xrange(config.MAX_EXPECTED_DIFF_NICKS):
            if(len(conversation[index]) == 3 and conversation[index][0] >= config.THRESHOLD_MESSAGE_NUMBER_GRAPH):
                if len(conversation[index][1]) >= config.MINIMUM_NICK_LENGTH and len(conversation[index][2]) >= config.MINIMUM_NICK_LENGTH:
                    message_graph.add_edge(conversation[index][1], conversation[index][2], weight=conversation[index][0])
        return message_graph

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            today_conversation = [[0] for i in range(config.MAX_EXPECTED_DIFF_NICKS)]
            for line in day_log:
                flag_comma = 0

                if(util.check_if_msg_line (line)):
                    # The sender is the text between the first '<' and '>'.
                    parsed_nick = re.search(r"\<(.*?)\>", line)
                    corrected_nick = util.correctLastCharCR(parsed_nick.group(0)[1:-1])
                    nick_sender = ""
                    nick_receiver = ""
                    nick_sender = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, corrected_nick, conn_comp_list, nick_sender)

                    # Every known nick is tested as a possible addressee.
                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        msg_no_analysis_helper(rec_list, corrected_nick, nick, conn_comp_list, conversations,today_conversation)

                        # Several addressees separated by commas.
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2=[e.strip() for e in rec_list[1].split(',')]
                            for i in xrange(0,len(rec_list_2)):
                                if(rec_list_2[i]):
                                    rec_list_2[i] = util.correctLastCharCR(rec_list_2[i])
                            msg_no_analysis_helper(rec_list_2, corrected_nick, nick, conn_comp_list, conversations, today_conversation)

                        if(flag_comma == 0):
                            rec = line[line.find(">")+1:line.find(", ")]
                            rec = rec[1:]
                            rec = util.correctLastCharCR(rec)
                            # NOTE(review): this branch only assigns
                            # nick_receiver; no conversation list is extended
                            # here - confirm this is intended.
                            if(rec == nick):
                                if(corrected_nick != nick):
                                    nick_receiver = nick_receiver_from_conn_comp(nick, conn_comp_list)

            if DAY_BY_DAY_ANALYSIS:
                today_message_number_graph = nx.DiGraph()
                today_message_number_graph = message_no_add_egde(today_message_number_graph, today_conversation)
                year, month, day = util.get_year_month_day(day_content)
                message_number_day_list.append([today_message_number_graph, year+'-'+month+'-'+day])

    print "\nBuilding graph object with EDGE WEIGHT THRESHOLD:", config.THRESHOLD_MESSAGE_NUMBER_GRAPH

    if not DAY_BY_DAY_ANALYSIS:
        aggregate_message_number_graph = message_no_add_egde(aggregate_message_number_graph, conversations)

    if config.DEBUGGER:
        print "========> 30 on " + str(len(conversations)) + " conversations"
        print conversations[:30]

    if DAY_BY_DAY_ANALYSIS:
        return message_number_day_list
    else:
        return aggregate_message_number_graph
def findResponseTime(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
    """Append the response-time histogram of a channel to a CSV file.

    Daily logs are read from
    ``<log_directory><MM>/<DD>/<channel_name>.txt``. A first pass collects
    all nicknames and groups the ones joined by nick-change lines; the
    connected components of those groups are the users, numbered from 1.
    A second pass collects, per day and per pair of users, the ``line[1:6]``
    time stamp of every message in which one addresses the other. The gaps
    (in minutes) between consecutive stamps of a pair are the response
    times; a pair with a single message contributes that message's
    minute-of-day instead, as the original implementation did.

    For every value from 0 up to the largest response time, the row
    ``(value, number of occurrences)`` is appended to
    ``<output_directory><channel_name>_<startingMonth>-<startingDate>_<endingMonth>-<endingDate>_RT.csv``.
    When no response time was found, no row is written.

    Args:
        log_directory (str): prefix of the log tree, with trailing separator.
        channel_name (str): log file base name.
        output_directory (str): prefix of the CSV path, with trailing separator.
        startingDate (int): first day read in ``startingMonth``.
        startingMonth (int): first month read.
        endingDate (int): last day read in ``endingMonth``.
        endingMonth (int): last month read.
    """
    out_dir_msg_num = output_directory
    if not os.path.exists(os.path.dirname(out_dir_msg_num)):
        try:
            os.makedirs(os.path.dirname(out_dir_msg_num))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    def _daily_logs():
        """Yield (path, lines) for every existing daily log in the range."""
        for month in range(startingMonth, endingMonth + 1):
            first_day = startingDate if month == startingMonth else 1
            stop_day = endingDate + 1 if month == endingMonth else 32
            for day in range(first_day, stop_day):
                path = (log_directory + str(month).zfill(2) + "/"
                        + str(day).zfill(2) + "/" + channel_name + ".txt")
                if not os.path.exists(path):
                    # Days that cannot (or may not) exist are skipped quietly.
                    no_such_day = ((month == 2 and day in (29, 30, 31))
                                   or (month in (4, 6, 9, 11) and day == 31))
                    if not no_such_day:
                        print("[Error] Path " + path + " doesn't exist")
                    continue
                with open(path) as f:
                    content = f.readlines()
                yield path, content

    def _is_message(line):
        """Tell whether *line* looks like "[HH:MM] <nick> text"."""
        return line[0] != '=' and "] <" in line and "> " in line

    def _fix(text):
        """Apply correctLastCharCR to non-empty strings only."""
        return correctLastCharCR(text) if text else text

    def _mentions(line):
        """Return the strings of *line* that may name an addressee.

        Returns None when nothing follows the sender tag in the second
        colon-separated piece.
        """
        rec_list = [piece.strip() for piece in line.split(':')]
        rec_list[1] = rec_list[1][rec_list[1].find(">") + 1:][1:]
        if not rec_list[1]:
            return None
        rec_list = [_fix(piece) for piece in rec_list]
        if "," in rec_list[1]:
            # "<a> b, c ..." form: every comma-separated piece is a candidate.
            extra = [_fix(piece.strip()) for piece in rec_list[1].split(',')]
        else:
            extra = [correctLastCharCR(line[line.find(">") + 1:line.find(", ")][1:])]
        return rec_list + extra

    # ---- pass 1: nicknames and alias groups --------------------------------
    nicks = []  # every nickname, in first-seen order
    nick_same_list = []  # each entry: nicknames known to be one user
    for path, content in _daily_logs():
        print(path)
        for line in content:
            if _is_message(line):
                raw_nick = re.search(r"\<(.*?)\>", line).group(0)[1:-1]
                if raw_nick not in nicks:
                    nicks.append(raw_nick)
        for position, nick in enumerate(nicks):
            nicks[position] = _fix(nick)
        for line in content:
            if line[0] == '=' and "changed the topic of" not in line:
                # Expected form: "=== old is now known as new".
                old_nick = _fix(line[line.find("=") + 1:line.find(" is")][3:])
                new_nick = _fix(line[line.find("wn as") + 1:line.find("\n")][5:])
                for alias in (old_nick, new_nick):
                    if alias not in nicks:
                        nicks.append(alias)
                for group in nick_same_list:
                    if old_nick in group or new_nick in group:
                        if new_nick not in group:
                            group.append(new_nick)
                        elif old_nick not in group:
                            group.append(old_nick)
                        break
                else:
                    nick_same_list.append([old_nick, new_nick])

    # Nicknames never involved in a nick change become one-member groups.
    for nick in nicks:
        if not any(nick in group for group in nick_same_list):
            nick_same_list.append([nick])

    # connected_components may return a generator of sets, so it is consumed
    # by iteration only; users are numbered from 1 in component order.
    G = to_graph(nick_same_list)
    user_of = {}
    for number, component in enumerate(connected_components(G), 1):
        for member in component:
            user_of[member] = number

    # ---- pass 2: response times --------------------------------------------
    response_times = []
    for path, content in _daily_logs():
        print(path)
        conversations = []  # [user id, user id, "HH:MM", "HH:MM", ...]
        for line in content:
            if not _is_message(line):
                continue
            var = correctLastCharCR(re.search(r"\<(.*?)\>", line).group(0)[1:-1])
            mentions = _mentions(line)
            sender = user_of.get(var)
            if mentions is None or sender is None:
                continue
            stamp = line[1:6]
            for nick in nicks:
                if nick == var:
                    continue
                hits = mentions.count(nick)
                receiver = user_of.get(nick)
                if not hits or receiver is None:
                    continue
                # A conversation is matched regardless of direction.
                for conv in conversations:
                    if sender in conv and receiver in conv:
                        break
                else:
                    conv = [sender, receiver]
                    conversations.append(conv)
                conv.extend([stamp] * hits)

        for conv in conversations:
            minutes = [int(t[0:2]) * 60 + int(t[3:5]) for t in conv[2:]]
            if len(minutes) == 1:
                response_times.append(minutes[0])
            else:
                response_times.extend(
                    later - earlier for earlier, later in zip(minutes, minutes[1:]))

    # ---- histogram ---------------------------------------------------------
    frequency = {}
    for value in response_times:
        frequency[value] = frequency.get(value, 0) + 1
    rows = []
    if response_times:
        rows = [(value, frequency.get(value, 0))
                for value in range(0, max(response_times) + 1)]

    # Finally storing the RT values along with their frequencies in a csv file.
    filename = out_dir_msg_num+channel_name+"_"+str(startingMonth)+"-"+str(startingDate)+"_"+str(endingMonth)+"-"+str(endingDate)+"_RT.csv"
    with open(filename, 'a+') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for row in rows:
            wr.writerow(row)
def createAggregateGraph(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
    """Draw one directed message-count graph for the whole date range.

    Daily logs are read from
    ``<log_directory><MM>/<DD>/<channel_name>.txt``. A first pass collects
    all nicknames and groups the ones joined by nick-change lines; the
    connected components of those groups are the users, identified by their
    1-based component number as a string. A second pass counts, for every
    ordered (sender, receiver) pair of users, the messages in which the
    sender addresses one of the receiver's nicknames.

    The counts become the ``weight`` (and ``label``) of the edges of a
    ``DiGraph`` that is laid out with ``dot`` and drawn to
    ``<output_directory><channel_name>_2013_<startingMonth>_<endingMonth>_aggregategraph.png``.

    Args:
        log_directory (str): prefix of the log tree, with trailing separator.
        channel_name (str): log file base name.
        output_directory (str): prefix of the image path, with trailing separator.
        startingDate (int): first day read in ``startingMonth``.
        startingMonth (int): first month read.
        endingDate (int): last day read in ``endingMonth``.
        endingMonth (int): last month read.
    """
    if not os.path.exists(os.path.dirname(output_directory)):
        try:
            os.makedirs(os.path.dirname(output_directory))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    def _daily_logs():
        """Yield (path, lines) for every existing daily log in the range."""
        for month in range(startingMonth, endingMonth + 1):
            first_day = startingDate if month == startingMonth else 1
            stop_day = endingDate + 1 if month == endingMonth else 32
            for day in range(first_day, stop_day):
                path = (log_directory + str(month).zfill(2) + "/"
                        + str(day).zfill(2) + "/" + channel_name + ".txt")
                if not os.path.exists(path):
                    # Days that cannot (or may not) exist are skipped quietly.
                    no_such_day = ((month == 2 and day in (29, 30, 31))
                                   or (month in (4, 6, 9, 11) and day == 31))
                    if not no_such_day:
                        print("[Error] Path " + path + " doesn't exist")
                    continue
                with open(path) as f:
                    content = f.readlines()
                yield path, content

    def _is_message(line):
        """Tell whether *line* looks like "[HH:MM] <nick> text"."""
        return line[0] != '=' and "] <" in line and "> " in line

    def _mentions(line):
        """Return the strings of *line* that may name an addressee.

        Returns None when nothing follows the sender tag in the second
        colon-separated piece.
        """
        rec_list = [piece.strip() for piece in line.split(':')]
        rec_list[1] = rec_list[1][rec_list[1].find(">") + 1:][1:]
        if not rec_list[1]:
            return None
        rec_list = [correctLastCharCR(piece) if piece else piece for piece in rec_list]
        if "," in rec_list[1]:
            # "<a> b, c ..." form: every comma-separated piece is a candidate.
            extra = [correctLastCharCR(piece) if piece else piece
                     for piece in (part.strip() for part in rec_list[1].split(','))]
        else:
            extra = [correctLastCharCR(line[line.find(">") + 1:line.find(", ")][1:])]
        return rec_list + extra

    # ---- pass 1: nicknames and alias groups --------------------------------
    nicks = []  # every nickname, in first-seen order
    nick_same_list = []  # each entry: nicknames known to be one user
    for path, content in _daily_logs():
        print("Working on " + path)
        for line in content:
            if _is_message(line):
                raw_nick = re.search(r"\<(.*?)\>", line).group(0)[1:-1]
                if raw_nick not in nicks:
                    nicks.append(raw_nick)
        for position, nick in enumerate(nicks):
            nicks[position] = correctLastCharCR(nick)
        for line in content:
            if line[0] == '=' and "changed the topic of" not in line:
                # Expected form: "=== old is now known as new".
                old_nick = correctLastCharCR(line[line.find("=") + 1:line.find(" is")][3:])
                new_nick = correctLastCharCR(line[line.find("wn as") + 1:line.find("\n")][5:])
                for alias in (old_nick, new_nick):
                    if alias not in nicks:
                        nicks.append(alias)
                for group in nick_same_list:
                    if old_nick in group or new_nick in group:
                        if new_nick not in group:
                            group.append(new_nick)
                        elif old_nick not in group:
                            group.append(old_nick)
                        break
                else:
                    nick_same_list.append([old_nick, new_nick])

    # Nicknames never involved in a nick change become one-member groups.
    for nick in nicks:
        if not any(nick in group for group in nick_same_list):
            nick_same_list.append([nick])

    # connected_components may return a generator of sets, so it is consumed
    # by iteration only; users are numbered from 1 in component order.
    G = to_graph(nick_same_list)
    user_of = {}
    for number, component in enumerate(connected_components(G), 1):
        for member in component:
            user_of[member] = str(number)

    # ---- pass 2: message counts --------------------------------------------
    conversations = []  # [message count, sender id, receiver id], first-seen order
    slot_of = {}  # (sender id, receiver id) -> index into conversations
    for path, content in _daily_logs():
        print(path)
        for line in content:
            if not _is_message(line):
                continue
            var = correctLastCharCR(re.search(r"\<(.*?)\>", line).group(0)[1:-1])
            mentions = _mentions(line)
            sender = user_of.get(var)
            if mentions is None or sender is None:
                continue
            for nick in nicks:
                if nick == var:
                    continue
                hits = mentions.count(nick)
                receiver = user_of.get(nick)
                if not hits or receiver is None:
                    continue
                pair = (sender, receiver)
                if pair not in slot_of:
                    slot_of[pair] = len(conversations)
                    conversations.append([0, sender, receiver])
                conversations[slot_of[pair]][0] += hits

    aggregate_graph = nx.DiGraph()
    for count, sender, receiver in conversations:
        aggregate_graph.add_edge(sender, receiver, weight=count)

    # Graphviz shows the "label" attribute, so mirror the weight into it.
    for u, v, d in aggregate_graph.edges(data=True):
        d['label'] = d.get('weight', '')

    output_file = output_directory+channel_name+"_2013_"+str(startingMonth)+"_"+str(endingMonth)+"_aggregategraph.png"
    print("Generating "+output_file)
    print("Please wait ....")
    A = nx.drawing.nx_agraph.to_agraph(aggregate_graph)
    A.layout(prog='dot')
    A.draw(output_file)
    print("Done Generating")
def createMessageTimeGraph(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
    """Draw, for every day, a multigraph of who addressed whom and when.

    Daily logs are read from
    ``<log_directory><MM>/<DD>/<channel_name>.txt``. A first pass over the
    whole range collects all nicknames and groups the ones joined by
    nick-change lines; the connected components of those groups are the
    users, identified by their 1-based component number as a string.

    In the second pass each day gets its own ``MultiDiGraph``: every message
    in which a user addresses another user's nickname adds one edge whose
    ``weight`` (and ``label``) is the ``line[1:6]`` time stamp. The graph is
    laid out with ``dot`` and drawn to
    ``<output_directory><channel_name>_2013_<month>_<day>_msg_time.png``.

    Args:
        log_directory (str): prefix of the log tree, with trailing separator.
        channel_name (str): log file base name.
        output_directory (str): prefix of the image paths, with trailing separator.
        startingDate (int): first day read in ``startingMonth``.
        startingMonth (int): first month read.
        endingDate (int): last day read in ``endingMonth``.
        endingMonth (int): last month read.
    """
    out_dir_msg_time = output_directory
    if not os.path.exists(os.path.dirname(out_dir_msg_time)):
        try:
            os.makedirs(os.path.dirname(out_dir_msg_time))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    def _daily_logs():
        """Yield (month, day, lines) for every existing daily log in the range."""
        for month in range(startingMonth, endingMonth + 1):
            first_day = startingDate if month == startingMonth else 1
            stop_day = endingDate + 1 if month == endingMonth else 32
            for day in range(first_day, stop_day):
                path = (log_directory + str(month).zfill(2) + "/"
                        + str(day).zfill(2) + "/" + channel_name + ".txt")
                if not os.path.exists(path):
                    # Days that cannot (or may not) exist are skipped quietly.
                    no_such_day = ((month == 2 and day in (29, 30, 31))
                                   or (month in (4, 6, 9, 11) and day == 31))
                    if not no_such_day:
                        print("[Error] Path " + path + " doesn't exist")
                    continue
                with open(path) as f:
                    content = f.readlines()
                yield month, day, content

    def _is_message(line):
        """Tell whether *line* looks like "[HH:MM] <nick> text"."""
        return line[0] != '=' and "] <" in line and "> " in line

    def _mentions(line):
        """Return the strings of *line* that may name an addressee.

        Returns None when nothing follows the sender tag in the second
        colon-separated piece.
        """
        rec_list = [piece.strip() for piece in line.split(':')]
        rec_list[1] = rec_list[1][rec_list[1].find(">") + 1:][1:]
        if not rec_list[1]:
            return None
        rec_list = [correctLastCharCR(piece) if piece else piece for piece in rec_list]
        if "," in rec_list[1]:
            # "<Dhruv> Rohan, Ram" form: every comma-separated piece is a candidate.
            extra = [correctLastCharCR(piece) if piece else piece
                     for piece in (part.strip() for part in rec_list[1].split(','))]
        else:
            # "<Dhruv> Rohan, Hi!" form.
            extra = [correctLastCharCR(line[line.find(">") + 1:line.find(", ")][1:])]
        return rec_list + extra

    # ---- pass 1: nicknames and alias groups --------------------------------
    nicks = []  # every nickname, in first-seen order
    nick_same_list = []  # each entry: nicknames known to be one user
    for month, day, content in _daily_logs():
        for line in content:
            if _is_message(line):
                raw_nick = re.search(r"\<(.*?)\>", line).group(0)[1:-1]
                if raw_nick not in nicks:
                    nicks.append(raw_nick)
        for position, nick in enumerate(nicks):
            nicks[position] = correctLastCharCR(nick)
        for line in content:
            if line[0] == '=' and "changed the topic of" not in line:
                # Expected form: "=== old is now known as new".
                old_nick = correctLastCharCR(line[line.find("=") + 1:line.find(" is")][3:])
                new_nick = correctLastCharCR(line[line.find("wn as") + 1:line.find("\n")][5:])
                for alias in (old_nick, new_nick):
                    if alias not in nicks:
                        nicks.append(alias)
                for group in nick_same_list:
                    if old_nick in group or new_nick in group:
                        if new_nick not in group:
                            group.append(new_nick)
                        elif old_nick not in group:
                            group.append(old_nick)
                        break
                else:
                    nick_same_list.append([old_nick, new_nick])

    # Nicknames never involved in a nick change become one-member groups.
    for nick in nicks:
        if not any(nick in group for group in nick_same_list):
            nick_same_list.append([nick])

    # connected_components may return a generator of sets, so it is consumed
    # by iteration only; users are numbered from 1 in component order.
    G = to_graph(nick_same_list)
    user_of = {}
    for number, component in enumerate(connected_components(G), 1):
        for member in component:
            user_of[member] = str(number)

    # ---- pass 2: one conversation graph per day ----------------------------
    for month, day, content in _daily_logs():
        graph_conversation = nx.MultiDiGraph()  # parallel edges: one per message
        for line in content:
            if not _is_message(line):
                continue
            var = correctLastCharCR(re.search(r"\<(.*?)\>", line).group(0)[1:-1])
            mentions = _mentions(line)
            sender = user_of.get(var)
            if mentions is None or sender is None:
                continue
            for nick in nicks:
                if nick == var:
                    continue
                receiver = user_of.get(nick)
                if receiver is None:
                    continue
                for _ in range(mentions.count(nick)):
                    graph_conversation.add_edge(sender, receiver, weight=line[1:6])

        # Graphviz shows the "label" attribute, so mirror the weight into it.
        for u, v, d in graph_conversation.edges(data=True):
            d['label'] = d.get('weight', '')

        output_file = out_dir_msg_time+channel_name+"_2013_"+str(month)+"_"+str(day)+"_msg_time.png"
        print("Generated " + output_file)
        A = nx.drawing.nx_agraph.to_agraph(graph_conversation)
        A.layout(prog='dot')
        A.draw(output_file)
gf_idx[t].append(newfamily) else: gf_idx[t] = [newfamily] verbalise("Y", "Results after merging based on sequence identity:") report(gene_families, 5, verbalise=verbalise) ############################################################################# # merge gene families based on Trinity gene groups (solution from SO): G = to_graph(gene_families) # collect new gene families: trinity_pool = {} gf_idx = {} # reset the gene family index to the new groups count = 0 for group in connected_components(G): count += 1 ts = [] for gid in group: ts += geneid_idx[gid] newgf = Gene_family(ts) # set index to find gf for given transcript: for t in newgf: gf_idx[t.td_id] = newgf trinity_pool[newgf] = True # report results: verbalise("Y", "\n\nResults after merging based on Trinity gene assignment:") report(trinity_pool, 5, verbalise=verbalise)
def createMessageNumberGraph(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
    """Draw, for every day, a directed graph weighted by message counts.

    Daily logs are read from
    ``<log_directory><MM>/<DD>/<channel_name>.txt``. A first pass over the
    whole range collects all nicknames and groups the ones joined by
    nick-change lines; the connected components of those groups are the
    users, identified by their 1-based component number as a string.

    In the second pass each day gets its own ``DiGraph``: for every ordered
    (sender, receiver) pair of users the edge ``weight`` (and ``label``) is
    the number of messages of that day in which the sender addresses one of
    the receiver's nicknames. The graph is laid out with ``dot`` and drawn
    to ``<output_directory><channel_name>_2013_<month>_<day>_msg_num.png``.

    Args:
        log_directory (str): prefix of the log tree, with trailing separator.
        channel_name (str): log file base name.
        output_directory (str): prefix of the image paths, with trailing separator.
        startingDate (int): first day read in ``startingMonth``.
        startingMonth (int): first month read.
        endingDate (int): last day read in ``endingMonth``.
        endingMonth (int): last month read.
    """
    out_dir_msg_num = output_directory
    if not os.path.exists(os.path.dirname(out_dir_msg_num)):
        try:
            os.makedirs(os.path.dirname(out_dir_msg_num))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    def _daily_logs():
        """Yield (month, day, path, lines) for every existing daily log."""
        for month in range(startingMonth, endingMonth + 1):
            first_day = startingDate if month == startingMonth else 1
            stop_day = endingDate + 1 if month == endingMonth else 32
            for day in range(first_day, stop_day):
                path = (log_directory + str(month).zfill(2) + "/"
                        + str(day).zfill(2) + "/" + channel_name + ".txt")
                if not os.path.exists(path):
                    # Days that cannot (or may not) exist are skipped quietly.
                    no_such_day = ((month == 2 and day in (29, 30, 31))
                                   or (month in (4, 6, 9, 11) and day == 31))
                    if not no_such_day:
                        print("[Error] Path " + path + " doesn't exist")
                    continue
                with open(path) as f:
                    content = f.readlines()
                yield month, day, path, content

    def _is_message(line):
        """Tell whether *line* looks like "[HH:MM] <nick> text"."""
        return line[0] != '=' and "] <" in line and "> " in line

    def _mentions(line):
        """Return the strings of *line* that may name an addressee.

        Returns None when nothing follows the sender tag in the second
        colon-separated piece.
        """
        rec_list = [piece.strip() for piece in line.split(':')]
        rec_list[1] = rec_list[1][rec_list[1].find(">") + 1:][1:]
        if not rec_list[1]:
            return None
        rec_list = [correctLastCharCR(piece) if piece else piece for piece in rec_list]
        if "," in rec_list[1]:
            # "<a> b, c ..." form: every comma-separated piece is a candidate.
            extra = [correctLastCharCR(piece) if piece else piece
                     for piece in (part.strip() for part in rec_list[1].split(','))]
        else:
            extra = [correctLastCharCR(line[line.find(">") + 1:line.find(", ")][1:])]
        return rec_list + extra

    # ---- pass 1: nicknames and alias groups --------------------------------
    nicks = []  # every nickname, in first-seen order
    nick_same_list = []  # each entry: all the nicks of one particular person
    for month, day, path, content in _daily_logs():
        print("Working on " + path)
        for line in content:
            if _is_message(line):
                raw_nick = re.search(r"\<(.*?)\>", line).group(0)[1:-1]
                if raw_nick not in nicks:
                    nicks.append(raw_nick)
        for position, nick in enumerate(nicks):
            nicks[position] = correctLastCharCR(nick)
        for line in content:
            if line[0] == '=' and "changed the topic of" not in line:
                # Expected form: "=== old is now known as new".
                old_nick = correctLastCharCR(line[line.find("=") + 1:line.find(" is")][3:])
                new_nick = correctLastCharCR(line[line.find("wn as") + 1:line.find("\n")][5:])
                for alias in (old_nick, new_nick):
                    if alias not in nicks:
                        nicks.append(alias)
                for group in nick_same_list:
                    if old_nick in group or new_nick in group:
                        if new_nick not in group:
                            group.append(new_nick)
                        elif old_nick not in group:
                            group.append(old_nick)
                        break
                else:
                    nick_same_list.append([old_nick, new_nick])

    # Nicknames never involved in a nick change become one-member groups.
    for nick in nicks:
        if not any(nick in group for group in nick_same_list):
            nick_same_list.append([nick])

    # connected_components may return a generator of sets, so it is consumed
    # by iteration only; users are numbered from 1 in component order.
    G = to_graph(nick_same_list)
    user_of = {}
    for number, component in enumerate(connected_components(G), 1):
        for member in component:
            user_of[member] = str(number)

    # ---- pass 2: one message-count graph per day ---------------------------
    for month, day, path, content in _daily_logs():
        print(path)
        conversations = []  # [message count, sender id, receiver id], first-seen order
        slot_of = {}  # (sender id, receiver id) -> index into conversations
        for line in content:
            if not _is_message(line):
                continue
            var = correctLastCharCR(re.search(r"\<(.*?)\>", line).group(0)[1:-1])
            mentions = _mentions(line)
            sender = user_of.get(var)
            if mentions is None or sender is None:
                continue
            for nick in nicks:
                if nick == var:
                    continue
                hits = mentions.count(nick)
                receiver = user_of.get(nick)
                if not hits or receiver is None:
                    continue
                pair = (sender, receiver)
                if pair not in slot_of:
                    slot_of[pair] = len(conversations)
                    conversations.append([0, sender, receiver])
                conversations[slot_of[pair]][0] += hits

        msg_num_graph = nx.DiGraph()
        for count, sender, receiver in conversations:
            msg_num_graph.add_edge(sender, receiver, weight=count)

        # Graphviz shows the "label" attribute, so mirror the weight into it.
        for u, v, d in msg_num_graph.edges(data=True):
            d['label'] = d.get('weight', '')

        output_file = out_dir_msg_num+channel_name+"_2013_"+str(month)+"_"+str(day)+"_msg_num.png"
        print("Generated " + output_file)
        A = nx.drawing.nx_agraph.to_agraph(msg_num_graph)
        A.layout(prog='dot')
        A.draw(output_file)
def gcc_size( self ):
    """Return the size of the largest connected component of the network.

    The size is ``len()`` of a component as produced by
    ``connected_components( self._network )``. As ``max`` is applied to
    the list of sizes, a network without any component raises ValueError.
    """
    return max( [ len( component ) for component in connected_components( self._network ) ] )
def createGephiTimelapseCSV(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
    """[Deprecated] Produce the node and edge CSV files used to build a Gephi timelapse of user interactions.

    The daily logs are read from <log_directory><MM>/<DD>/<channel_name>.txt for
    every day in the requested range, in three passes:

    1. collect every nickname and group the nicknames that belong to the same
       user (via nick-change lines and connected components);
    2. emit one node row per sender seen on each day;
    3. emit one edge row for every message that addresses another known nick,
       stamped with the time taken from the log line.

    Rows are appended to two hard-coded CSV files (see the NOTE(review) comments
    near the writers).

    Parameters:
        log_directory: prefix of the log tree; it is concatenated directly with
            the month folder, so it is expected to end with a path separator.
        channel_name: base name of the per-day log file (without ".txt").
        output_directory: not used by this function.
        startingDate, startingMonth: first day / month to read (inclusive).
        endingDate, endingMonth: last day / month to read (inclusive).

    Returns:
        None. The results are the rows appended to the two CSV files.
    """
    today1= []
    today2 =[]
    col1 = []
    col2 = []
    col3 = []
    col4 = []
    col_edge1 = []
    col_edge2 = []
    col_edge3 = []
    col_edge4 = []
    col_edge5 = []
    col_edge6 = []
    # Build 365 consecutive calendar dates. Entry i is the date (3*365 + 105 - i) days before
    # "now", so the start of the window depends on the day the code is run (the offset of 105
    # was chosen by the original author 105 days after 1 Jan 2016). Gephi needs timestamps in
    # a "date h:m:s" form; trying the timelapse import in Gephi first makes this easier to follow.
    for i in range(0,365):
        dt1= datetime.datetime.now() - datetime.timedelta(days=3*365 + 105 - i)
        today1.append(dt1.strftime('%Y-%m-%d'))
        dt2= datetime.datetime.now() - datetime.timedelta(days=3*365 + 105 - i)  # same offset as dt1
        today2.append(dt2.strftime('%Y-%m-%d'))
    # Disabled alternative: append " 00:00:01" to every today1 entry so that an edge appears for
    # the whole day regardless of when it was sent:
    #for xv in range(len(today1)):
    #today1[xv]= today1[xv] + " 00:00:01"
    # The log files tell us when an edge appears, so the real time is used instead (see pass 3).
    for xz in range(len(today2)):
        today2[xz]= today2[xz] + " 23:59:59"  # once an edge appears it only disappears when its day ends
    nick_same_list=[[] for i in range(5000)]  # fixed pool of alias groups; at most 5000 groups are supported
    nicks = [] #list of all the nicknames
    my_sum = 0  # counts emitted edges; only incremented, never read in this function

    # ---- Pass 1: collect nicknames and alias groups --------------------------------------
    for folderiterator in range(startingMonth, endingMonth+1):
        temp1 = "0" if folderiterator < 10 else ""  # zero-pad the month folder
        for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
            temp2 = "0" if fileiterator < 10 else ""  # zero-pad the day folder
            filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"
            if not os.path.exists(filePath):
                # Stay quiet for 29-31 February and for the 31st of the 30-day months;
                # any other missing file is reported. Either way the day is skipped.
                if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )):
                    print "[Error] Path "+filePath+" doesn't exist"
                continue
            with open(filePath) as f:
                content = f.readlines() # content stores all the lines of this day's log file
            send_time = [] #list of all the times a user sends a message to another user
            nicks_for_the_day = []
            print(filePath+ "For Nicks")
            #code for getting all the nicknames in a list
            for i in content:
                # message lines: do not start with '=' and contain "] <" and "> "
                if(i[0] != '=' and "] <" in i and "> " in i):
                    m = re.search(r"\<(.*?)\>", i)
                    if m.group(0) not in nicks_for_the_day:
                        nicks_for_the_day.append(m.group(0)) # regex match is the sender including the surrounding <>
            for i in xrange(0,len(nicks_for_the_day)):
                if nicks_for_the_day[i][1:-1] not in nicks:
                    nicks.append(nicks_for_the_day[i][1:-1]) # strip the <> from the nickname
            for i in xrange(0,len(nicks)):
                if(len(nicks[i])!=0):
                    nicks[i]=correctLastCharCR(nicks[i])
            for j in content:
                # '=' lines other than topic changes are parsed as nick changes
                # (presumably "=== old is now known as new" - confirm against the log format)
                if(j[0]=='=' and "changed the topic of" not in j):
                    line1=j[j.find("=")+1:j.find(" is")]
                    line2=j[j.find("wn as")+1:j.find("\n")]
                    line1=line1[3:]  # text before " is", minus its first three characters
                    line2=line2[5:]  # text after "wn as", minus the leading "n as "
                    if(len(line1)!=0):
                        line1=correctLastCharCR(line1)
                    if(len(line2)!=0):
                        line2=correctLastCharCR(line2)
                    if line1 not in nicks:
                        nicks.append(line1)
                    if line2 not in nicks:
                        nicks.append(line2)
            #code for forming list of lists for avoiding nickname duplicacy
            for line in content:
                if(line[0]=='=' and "changed the topic of" not in line):
                    line1=line[line.find("=")+1:line.find(" is")]
                    line2=line[line.find("wn as")+1:line.find("\n")]
                    line1=line1[3:]
                    line2=line2[5:]
                    if(len(line1)!=0):
                        line1=correctLastCharCR(line1)
                    if(len(line2)!=0):
                        line2=correctLastCharCR(line2)
                    # Put the pair in the first group that already holds either nick,
                    # or start a new group in the first empty slot.
                    for i in range(5000):
                        if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
                            if line1 in nick_same_list[i] and line2 not in nick_same_list[i]:
                                nick_same_list[i].append(line2)
                                break
                            if line2 in nick_same_list[i] and line1 not in nick_same_list[i]:
                                nick_same_list[i].append(line1)
                                break
                            if line2 in nick_same_list[i] and line1 in nick_same_list[i]:
                                break
                        if not nick_same_list[i]:
                            nick_same_list[i].append(line1)
                            nick_same_list[i].append(line2)
                            break
    #print(x)
    # Nicks that never took part in a nick change get a group of their own.
    for ni in nicks:
        for ind in range(5000):
            if ni in nick_same_list[ind]:
                break
            if not nick_same_list[ind]:
                nick_same_list[ind].append(ni)
                break
    #print("*********************x**********************************")
    #print(nick_same_list)
    # Merge groups that share a nick: each connected component is one user.
    G = to_graph(nick_same_list)
    L = list(connected_components(G))
    for i in range(1,len(L)+1):
        L[i-1] = list(L[i-1])  # make each component indexable; L[k][0] is used as the user's name below
    # The original author notes that this grouping is explained further in parser-RT.py.

    # ---- Pass 2: node rows ---------------------------------------------------------------
    # All nodes stay present for the whole timelapse: every node row gets the same fixed
    # appear / disappear times (2013-01-01 to 2013-02-01). Change the literals below to use
    # another window. The rows are written to a CSV file that Gephi imports.
    createvar = -1  # index of the current existing log file; only used by the disabled code below
    for folderiterator in range(startingMonth, endingMonth+1):
        temp1 = "0" if folderiterator < 10 else ""
        for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
            temp2 = "0" if fileiterator < 10 else ""
            filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"
            if not os.path.exists(filePath):
                if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )):
                    print "[Error] Path "+filePath+" doesn't exist"
                continue
            with open(filePath) as f:
                content = f.readlines() # content stores all the lines of this day's log file
            createvar = createvar + 1
            nicks_for_the_day_2 = []
            print(filePath+ "For Nodes")
            #code for getting all the nicknames in a list
            for il in content:
                if(il[0] != '=' and "] <" in il and "> " in il):
                    m = re.search(r"\<(.*?)\>", il)
                    if m.group(0) not in nicks_for_the_day_2:
                        nicks_for_the_day_2.append(m.group(0)) # sender including the surrounding <>
            # NOTE(review): the loop variable `nx` is local to this function; if the module
            # imports networkx as `nx`, that alias is shadowed inside this function - confirm.
            for nx in nicks_for_the_day_2:
                for dk in range(len(nicks)):
                    if (dk<len(L) and nx[1:-1] in L[dk]):
                        # presumably Gephi's node Id and Label columns - verify against the import
                        col1.append(str(L[dk][0]))
                        col2.append(str(L[dk][0]))
                        #col3.append(today1[createvar])
                        #col4.append(today2[createvar])
                        col3.append("2013-01-01 00:00:01") # appear time, hard-coded: nodes stay throughout the month
                        col4.append("2013-02-01 00:00:01") # disappear time, hard-coded
                        break
    rows = zip(col1,col2,col3,col4) # one tuple per node row, later imported into Gephi
    # NOTE(review): the path is hard-coded and `output_directory` is ignored; the file is
    # opened in append mode, so repeated runs accumulate rows.
    with open('/home/dhruvie/LOP/nodesgephi_unchained.csv', 'a+') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for ro in rows:
            wr.writerow(ro)

    # ---- Pass 3: edge rows ---------------------------------------------------------------
    # Same idea for the edge CSV, except that the appear time of every edge is known:
    # it is line[1:6] of the log line, with ":00" appended to get hh:mm:ss.
    createvar_used=-1  # index of the current existing log file, used to pick the date from today1/today2
    for folderiterator in range(startingMonth, endingMonth+1):
        temp1 = "0" if folderiterator < 10 else ""
        for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
            temp2 = "0" if fileiterator < 10 else ""
            filePath=log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"
            if not os.path.exists(filePath):
                if not((folderiterator==2 and (fileiterator ==29 or fileiterator ==30 or fileiterator ==31)) or ((folderiterator==4 or folderiterator==6 or folderiterator==9 or folderiterator==11) and fileiterator==31 )):
                    print "[Error] Path "+filePath+" doesn't exist"
                continue
            with open(filePath) as f:
                content = f.readlines() # content stores all the lines of this day's log file
            createvar_used=createvar_used+1
            print(filePath+ "For Edges")
            nickplots = [] # users that appear as sender or receiver of an edge on this day (not read afterwards)
            indegree = [] # unused in this function
            outdegree = [] # unused in this function
            # G1 = nx.MultiDiGraph()
            for line in content:
                flag_comma = 0
                if(line[0] != '=' and "] <" in line and "> " in line):
                    m = re.search(r"\<(.*?)\>", line)
                    var = m.group(0)[1:-1]  # sender nick without the <>
                    var=correctLastCharCR(var)
                    # Map the sender to its user; nick_sender keeps its previous value if no group matches.
                    for d in range(len(nicks)):
                        if (d<len(L) and var in L[d]):
                            nick_sender = L[d][0]
                            break
                    # Check every known nick for being addressed by this message.
                    for i in nicks:
                        rec_list=[e.strip() for e in line.split(':')]
                        rec_list[1]=rec_list[1][rec_list[1].find(">")+1:len(rec_list[1])]
                        rec_list[1]=rec_list[1][1:]  # text after "> " up to the next ':'
                        if not rec_list[1]:
                            break
                        for ik in xrange(0,len(rec_list)):
                            if(rec_list[ik]):
                                rec_list[ik]=correctLastCharCR(rec_list[ik])
                        # Case 1: the nick is exactly one of the ':'-separated pieces ("nick: message").
                        for z in rec_list:
                            if(z==i):
                                send_time.append(line[1:6])
                                if(var != i):
                                    for d in range(len(nicks)):
                                        if i in L[d]:
                                            nick_receiver=L[d][0]
                                            break
                                    # One edge row for Gephi (presumably Source, Target, Type, Weight,
                                    # appear time, disappear time - verify against the import).
                                    col_edge1.append(nick_sender)
                                    col_edge2.append(nick_receiver)
                                    col_edge3.append("Directed")
                                    col_edge4.append(1.0)
                                    col_edge5.append(today1[createvar_used]+" "+line[1:6]+":00")
                                    col_edge6.append(today2[createvar_used])
                                    my_sum=my_sum+1
                                    # G1.add_edge(nick_sender,nick_receiver,weight=line[1:6])
                                    if nick_sender not in nickplots:
                                        nickplots.append(nick_sender)
                                    if nick_receiver not in nickplots:
                                        nickplots.append(nick_receiver)
                        # Case 2: the addressed part holds a comma-separated list ("nick1, nick2: message").
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2=[e.strip() for e in rec_list[1].split(',')]
                            for ij in xrange(0,len(rec_list_2)):
                                if(rec_list_2[ij]):
                                    rec_list_2[ij]=correctLastCharCR(rec_list_2[ij])
                            for j in rec_list_2:
                                if(j==i):
                                    send_time.append(line[1:6])
                                    if(var != i):
                                        for d in range(len(nicks)):
                                            if i in L[d]:
                                                nick_receiver=L[d][0]
                                                break
                                        col_edge1.append(nick_sender)
                                        col_edge2.append(nick_receiver)
                                        col_edge3.append("Directed")
                                        col_edge4.append(1.0)
                                        col_edge5.append(today1[createvar_used]+" "+line[1:6]+":00")
                                        col_edge6.append(today2[createvar_used])
                                        my_sum=my_sum+1
                                        # G1.add_edge(nick_sender,nick_receiver,weight=line[1:6])
                                        if nick_sender not in nickplots:
                                            nickplots.append(nick_sender)
                                        if nick_receiver not in nickplots:
                                            nickplots.append(nick_receiver)
                        # Case 3: no comma seen on this line: compare the nick with the text
                        # between "> " and the first ", " of the whole line.
                        if(flag_comma == 0):
                            rec=line[line.find(">")+1:line.find(", ")]
                            rec=rec[1:]
                            rec=correctLastCharCR(rec)
                            if(rec==i):
                                send_time.append(line[1:6])
                                if(var != i):
                                    for d in range(len(nicks)):
                                        if i in L[d]:
                                            nick_receiver=L[d][0]
                                            break
                                    col_edge1.append(nick_sender)
                                    col_edge2.append(nick_receiver)
                                    col_edge3.append("Directed")
                                    col_edge4.append(1.0)
                                    col_edge5.append(today1[createvar_used]+" "+line[1:6]+":00")
                                    col_edge6.append(today2[createvar_used])
                                    my_sum=my_sum+1
                                    # G1.add_edge(nick_sender,nick_receiver,weight=line[1:6])
                                    if nick_sender not in nickplots:
                                        nickplots.append(nick_sender)
                                    if nick_receiver not in nickplots:
                                        nickplots.append(nick_receiver)
    edge_rows = zip(col_edge1,col_edge2,col_edge3,col_edge4,col_edge5,col_edge6)
    # NOTE(review): hard-coded path again; `output_directory` is ignored and rows are appended.
    with open('/home/dhruvie/LOP/edgesgephi_unchained.csv', 'a+') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for rz in edge_rows:
            wr.writerow(rz)
#throughout a year. for ni in nicks: for ind in range(7000): if ni in x[ind]: break if not x[ind]: x[ind].append(ni) break #Append all the nicks from nicks array which are not already present in x. These would be the ones who did not change their nicks throughout the #year. G = to_graph(x) #function definition is on top L = connected_components(G) for i in range(1,len(L)+1): L[i-1] = [i]+L[i-1] # We use connected components algorithm to group all those nick clusters that have atleast one nick common in their clusters. So e.g. #Cluster 1- nick1,nick2,nick3,nick4(some nicks of a user) #Cluster 2 -nick5,nick6,nick2,nick7. Then we would get - nick1,nick2,nick3,nick4,nick5,nick6,nick7 and we can safely assume they belong to the same user. xarr=[[] for i in range(10000)] #This might need to be incremented from 10000 if we have more users. Same logic as the above 7000 one. Applies to all the other codes too. graph_to_sir = [] ## I would advice on using a different data structure which does not have an upper bound like we do in arrays. graph_x_axis = [] graph_y_axis = [] graphx1 =[] graphy1 =[]
def _iter_channel_logs(log_directory, channel_name, startingDate, startingMonth, endingDate, endingMonth):
    """Yield (filePath, lines) for every existing daily log of the channel, in date order.

    Logs are looked up at <log_directory><MM>/<DD>/<channel_name>.txt. A missing
    file is skipped; it is reported on stdout unless the day is 29-31 February
    or the 31st of April, June, September or November.
    """
    for folderiterator in range(startingMonth, endingMonth + 1):
        temp1 = "0" if folderiterator < 10 else ""
        first_day = startingDate if folderiterator == startingMonth else 1
        stop_day = endingDate + 1 if folderiterator == endingMonth else 32
        for fileiterator in range(first_day, stop_day):
            temp2 = "0" if fileiterator < 10 else ""
            filePath = log_directory+temp1+str(folderiterator)+"/"+temp2+str(fileiterator)+"/"+channel_name+".txt"
            if not os.path.exists(filePath):
                day_may_not_exist = ((folderiterator == 2 and fileiterator in (29, 30, 31))
                                     or (folderiterator in (4, 6, 9, 11) and fileiterator == 31))
                if not day_may_not_exist:
                    print("[Error] Path "+filePath+" doesn't exist")
                continue
            with open(filePath) as f:
                content = f.readlines()
            yield filePath, content


def _parse_nick_change(line):
    """Return the (old, new) nick pair sliced out of a '=' status line.

    Presumably the line looks like "=== old is now known as new" - the slicing
    below takes the text before " is" (minus its first three characters) and
    the text after "wn as".
    """
    old_nick = line[line.find("=")+1:line.find(" is")][3:]
    new_nick = line[line.find("wn as")+1:line.find("\n")][5:]
    if old_nick:
        old_nick = correctLastCharCR(old_nick)
    if new_nick:
        new_nick = correctLastCharCR(new_nick)
    return old_nick, new_nick


def _record_conversation_time(conversations, nick_sender, nick_receiver, minute_stamp):
    """Append minute_stamp to the slot of the two users, claiming the first empty slot if needed.

    A used slot has the layout [user_a, user_b, stamp, stamp, ...] and the used
    slots always form a prefix of `conversations`. When every slot is already
    taken by other pairs the stamp is silently dropped.
    """
    for slot in conversations:
        if not slot:
            slot.extend([nick_sender, nick_receiver, minute_stamp])
            return
        # Only the two leading entries identify the users. Testing membership
        # against the whole slot would let an integer user id match one of the
        # integer minute stamps stored behind them.
        if nick_sender in slot[:2] and nick_receiver in slot[:2]:
            slot.append(minute_stamp)
            return


def findConvLength_ConvRefreshTime(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth, response_time_threshold=9):
    """Write the conversation-length (CL) and conversation-refresh-time (CRT) histograms of a channel.

    The daily logs <log_directory><MM>/<DD>/<channel_name>.txt of the requested
    range are read twice: first to collect the nicknames and group the aliases
    of each user, then to record, for every pair of users, the minute at which
    one of them addressed the other. Within a pair, a gap of more than
    `response_time_threshold` minutes between consecutive messages ends a
    conversation: the elapsed time of the finished conversation goes to the CL
    data and the gap itself to the CRT data.

    Two CSV files are appended to, named
    <output_directory><channel_name>_<startMonth>-<startDate>_<endMonth>-<endDate>_CL.csv
    and ..._CRT.csv; each row is (value, number of occurrences).

    Parameters:
        log_directory: prefix of the log tree, concatenated directly with the
            month folder (so it is expected to end with a path separator).
        channel_name: base name of the per-day log file (without ".txt").
        output_directory: prefix of the output files; its directory part is
            created when missing.
        startingDate, startingMonth: first day / month to read (inclusive).
        endingDate, endingMonth: last day / month to read (inclusive).
        response_time_threshold: largest gap in minutes that still belongs to
            the same conversation. The default of 9 is the value that used to
            be hard-coded; it should be the average response time measured
            for the channel.

    Returns:
        None.

    Raises:
        OSError: if the output directory cannot be created.
    """
    nick_same_list = [[] for i in range(7000)]  # fixed pool of alias groups
    nicks = []  # list of all the nicknames
    conv = []  # conversation lengths, in minutes
    conv_diff = []  # gaps between consecutive conversations, in minutes
    out_dir_msg_num = output_directory
    if not os.path.exists(os.path.dirname(out_dir_msg_num)):
        try:
            os.makedirs(os.path.dirname(out_dir_msg_num))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    # ---- Pass 1: collect nicknames and alias groups ------------------------
    for filePath, content in _iter_channel_logs(log_directory, channel_name, startingDate, startingMonth, endingDate, endingMonth):
        print(filePath)
        nicks_for_the_day = []
        for entry in content:
            # message lines: do not start with '=' and contain "] <" and "> "
            if entry[0] != '=' and "] <" in entry and "> " in entry:
                m = re.search(r"\<(.*?)\>", entry)
                if m.group(0) not in nicks_for_the_day:
                    nicks_for_the_day.append(m.group(0))
        for tagged in nicks_for_the_day:
            if tagged[1:-1] not in nicks:
                nicks.append(tagged[1:-1])  # strip the surrounding <>
        for idx, nick in enumerate(nicks):
            if nick:
                nicks[idx] = correctLastCharCR(nick)
        for entry in content:
            if entry[0] == '=' and "changed the topic of" not in entry:
                line1, line2 = _parse_nick_change(entry)
                if line1 not in nicks:
                    nicks.append(line1)
                if line2 not in nicks:
                    nicks.append(line2)
                # Put the pair in the first group that already knows one of
                # the two nicks, or open a new group in the first empty slot.
                for group in nick_same_list:
                    has_old = line1 in group
                    has_new = line2 in group
                    if has_old or has_new:
                        if not has_new:
                            group.append(line2)
                        elif not has_old:
                            group.append(line1)
                        break
                    if not group:
                        group.append(line1)
                        group.append(line2)
                        break

    # Nicks that never took part in a nick change get a group of their own.
    for ni in nicks:
        for group in nick_same_list:
            if ni in group:
                break
            if not group:
                group.append(ni)
                break

    # Groups sharing a nick belong to the same user: merge them with connected
    # components and prefix every merged group with a 1-based user id.
    # connected_components may return a generator of sets, so every component
    # is materialised instead of relying on len() and list concatenation.
    G = to_graph(nick_same_list)
    L = [[uid] + list(component) for uid, component in enumerate(connected_components(G), 1)]
    known_groups = L[:len(nicks)]

    # ---- Pass 2: record when each pair of users talks -----------------------
    # This might need to be raised from 10000 if there are more user pairs.
    conversations = [[] for i in range(10000)]
    dateadd = -1  # index of the current log file, used as the day offset of the stamps
    for filePath, content in _iter_channel_logs(log_directory, channel_name, startingDate, startingMonth, endingDate, endingMonth):
        dateadd = dateadd + 1
        print(filePath)
        for line in content:
            if not (line[0] != '=' and "] <" in line and "> " in line):
                continue
            m = re.search(r"\<(.*?)\>", line)
            var = correctLastCharCR(m.group(0)[1:-1])  # sender nick
            # nick_sender deliberately keeps its previous value when no group matches.
            for group in known_groups:
                if var in group:
                    nick_sender = group[0]
                    break

            # Everything that can name a receiver is the same for all nicks,
            # so it is worked out once per line.
            # NOTE(review): this assumes correctLastCharCR is a pure function - confirm.
            rec_list = [e.strip() for e in line.split(':')]
            rec_list[1] = rec_list[1][rec_list[1].find(">")+1:][1:]
            if not rec_list[1]:
                continue
            rec_list = [correctLastCharCR(e) if e else e for e in rec_list]
            if "," in rec_list[1]:
                # "nick1, nick2: message"
                extra = [correctLastCharCR(p) if p else p
                         for p in (e.strip() for e in rec_list[1].split(','))]
            else:
                # text between "> " and the first ", " of the line
                extra = [correctLastCharCR(line[line.find(">")+1:line.find(", ")][1:])]
            addressed = rec_list + extra

            for i in nicks:
                # A nick may match more than one address form; every match
                # records the stamp once.
                hits = addressed.count(i)
                if not hits or var == i:
                    continue
                # nick_receiver keeps its previous value when no group matches.
                for group in known_groups:
                    if i in group:
                        nick_receiver = group[0]
                        break
                # assumes the line starts with "[hh:mm]" - minutes since the first log file
                minute_stamp = 24*60*dateadd + int(line[1:3])*60 + int(line[4:6])
                for _ in range(hits):
                    _record_conversation_time(conversations, nick_sender, nick_receiver, minute_stamp)

    # ---- Split every pair's stamps into conversations -----------------------
    for stamps in conversations:
        if not stamps:
            continue
        del stamps[0:2]  # drop the two user ids, keep only the minute stamps
        first = stamps[0]
        for previous, current in zip(stamps, stamps[1:]):
            if current - previous > response_time_threshold:
                conv.append(previous - first)
                conv_diff.append(current - previous)
                first = current
        if len(stamps) > 1:
            conv.append(stamps[-1] - first)  # the conversation still open at the end

    # ---- Histograms for the CDF plots ---------------------------------------
    # Without any conversation the histograms are empty (max() of an empty
    # list would raise ValueError).
    # NOTE(review): range() stops before the largest value, so the largest
    # CL / CRT itself never gets a row - confirm whether that is intended.
    graphx1 = list(range(0, max(conv))) if conv else []
    graphy1 = [conv.count(op) for op in graphx1]
    graphx2 = list(range(0, max(conv_diff))) if conv_diff else []
    graphy2 = [conv_diff.count(po) for po in graphx2]

    filename1 = out_dir_msg_num+channel_name+"_"+str(startingMonth)+"-"+str(startingDate)+"_"+str(endingMonth)+"-"+str(endingDate)+"_CL.csv"
    with open(filename1, 'a+') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(graphx1, graphy1))
    filename2 = out_dir_msg_num+channel_name+"_"+str(startingMonth)+"-"+str(startingDate)+"_"+str(endingMonth)+"-"+str(endingDate)+"_CRT.csv"
    with open(filename2, 'a+') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(graphx2, graphy2))