Example #1
0
def find_ids(nodes, *args):
	"""
	Collect the indices of all rows in `nodes` that match one of the given names.

	A row matches when it starts with an arbitrary (shortest possible)
	prefix followed by a tab, the literal `artist`, another tab, one of
	the entries of `args` and a newline. Matching ignores case. The
	entries of `args` are inserted into the pattern unescaped, so they
	are interpreted as regular expression fragments.

	Returns the list of matching indices in ascending order, or `[-1]`
	when no names were passed at all.
	"""
	if len(args) == 0:
		return [-1]
	progress = StatusBar(len(nodes))
	alternatives = '|'.join(args)
	pattern = re.compile(r'^.*?\tartist\t(%s)\n' % alternatives, re.IGNORECASE)
	matches = []
	for position, entry in enumerate(nodes):
		if pattern.match(entry) is not None:
			matches.append(position)
		# only refresh the status bar every 10000 rows
		if position % 10000 == 0:
			progress.update(position)
	progress.close()
	return matches
Example #2
0
File: merge.py Project: Wolff09/nap
def merge(nodes, edges, components):
	"""
	Combine node and edge lines into one list of tagged lines.

	Every non-empty line of `nodes` and `edges` becomes
	"<component>\t<tag>\t<line>", where <component> is
	`components[get_id(line)]` and <tag> is 'n' for a node line and
	'e' for an edge line. Empty or None entries are dropped.

	Both input lists are emptied by this call: lines are popped off
	the end, so nodes are emitted before edges and each group appears
	in reverse input order.
	"""
	result = []
	sources = [(nodes, 'n'), (edges, 'e')]
	for rows, tag in sources:
		progress = StatusBar(len(rows))
		processed = 0
		while rows:
			# pop() shrinks the caller's list so that the data is never
			# held twice in memory
			entry = rows.pop()
			if entry:
				component = components[get_id(entry)]
				result.append("%s\t%s\t%s" % (component, tag, entry))
			processed += 1
			if processed % 10000 == 0:
				progress.update(processed)
		progress.close()
	return result
Example #3
0
def delete(nodes, edges, *deletion_indices):
	"""
	Blank out the given nodes and every edge touching one of them.

	`nodes[i]` is set to None for each index i in `deletion_indices`.
	Each edge row is expected to start with two tab separated integer
	ids; if either id is among `deletion_indices` the row is replaced
	by None. Both lists are modified in place and keep their length,
	nothing is returned.

	Edge rows that are already None (or empty) are skipped, so the
	function may be applied repeatedly to the same lists.
	"""
	for index in deletion_indices:
		nodes[index] = None
	bar = StatusBar(len(edges))
	deletion_indices = set(deletion_indices) # set for constant time 'in' check
	for i, row in enumerate(edges):
		if row: # tolerate edges removed by an earlier call
			data = row.split("\t", 2)
			left = int(data[0])
			right = int(data[1])
			if left in deletion_indices or right in deletion_indices:
				edges[i] = None
		if i % 10000 == 0: bar.update(i)
	bar.close()
Example #4
0
	def read_lines(path, approx=10000000):
		"""
		Read all lines of the file at `path` except the first (header) line.

		`approx` is the expected number of lines; it is only handed to
		the status bar. The returned lines keep their line endings, and
		a missing newline on the last line is added. A file without any
		data lines yields an empty list.
		"""
		bar = StatusBar(approx)
		lines = []
		counter = 0
		with open(path) as handle:
			handle.readline() # drop header
			for line in handle:
				lines.append(line)
				counter += 1
				if counter % 10000 == 0: bar.update(counter)
		# `lines` is empty for an empty or header-only file; indexing
		# lines[-1] unconditionally would raise an IndexError then
		if lines and not lines[-1].endswith("\n"):
			lines[-1] += "\n"
		bar.close()
		return lines
Example #5
0
def compute(nodes, edges):
	"""
	Compute a component representative for every node.

	One set per node is created via `make_sets(len(nodes))`. For each
	non-empty edge line the two leading tab separated integer ids are
	joined with `union`. Afterwards every entry of the resulting list
	is replaced by the result of `find`, and that list is returned.
	(make_sets/union/find are defined elsewhere; presumably a
	disjoint-set forest -- verify against their definitions.)

	Empty or None edge lines are skipped. A non-empty line without a
	tab raises a ValueError.
	"""
	parents = make_sets(len(nodes))

	bar = StatusBar(len(edges))
	counter = 0
	for line in edges:
		if line:
			first_delimiter = line.find("\t")
			if first_delimiter == -1:
				raise ValueError("malformed edge line: %r" % line)
			second_delimiter = line.find("\t", first_delimiter + 1)
			if second_delimiter == -1:
				# only two fields: the second id runs to the end of the
				# line (a slice end of -1 would cut off the last character)
				second_delimiter = len(line)
			left = int(line[:first_delimiter])
			# start behind the tab instead of relying on int() to strip it
			right = int(line[first_delimiter + 1:second_delimiter])
			union(parents, left, right)
		counter += 1
		if counter % 5000 == 0: bar.update(counter)
	bar.close()

	# replace every entry by the value find() reports for it
	bar = StatusBar(len(parents))
	for counter, x in enumerate(parents):
		parents[counter] = find(parents, x)
		if counter % 10000 == 0: bar.update(counter)
	bar.close()
	return parents
Example #6
0
def process_data(path_to_nodes, path_to_edges, path_to_output, *deletion_names):
	"""
	Process the given data to be able to use the graph structure
	with networkx while not allocating over 9000MB of RAM.

	The nodes of the input data must have continuous ids.
	Furthermore, artist entries are expected to not end with
	a tab character. Otherwise an entry which should be deleted
	might not be deleted.

	The data undergoes the following steps.
		Step 1: read data into memory
		Step 2: delete nodes that match a given name
		Step 3: delete edges adjacent to nodes deleted in Step 2
		Step 4: find connected components
		Step 5: merge nodes and edges
		Step 6: sort
		Step 7: output to file

	path_to_nodes and path_to_edges name the input files (their first
	line is treated as a header and dropped), path_to_output names the
	file that is written. deletion_names are the names handed to
	various_artists.find_ids; Steps 2 and 3 are skipped if none are given.
	"""
	begin = datetime.now()

	# Step 1
	def read_lines(path, approx=10000000):
		# `approx` is the expected line count, used for the status bar only
		bar = StatusBar(approx)
		lines = []
		counter = 0
		with open(path) as handle:
			handle.readline() # drop header
			for line in handle:
				lines.append(line)
				counter += 1
				if counter % 10000 == 0: bar.update(counter)
		# an empty or header-only file leaves `lines` empty; do not index it
		if lines and not lines[-1].endswith("\n"):
			lines[-1] += "\n"
		bar.close()
		return lines
	# single-argument print(...) produces the same output on Python 2 and 3
	print(">>> Reading nodes and edges...")
	nodes = read_lines(path_to_nodes, approx=10000000)
	edges = read_lines(path_to_edges, approx=27000000)

	# Step 2 and 3
	if deletion_names:
		print(">>> Searching ids voted for deletion...")
		deletion_ids = various_artists.find_ids(nodes, *deletion_names)
		print(">>> Deleting nodes and edges...")
		various_artists.delete(nodes, edges, *deletion_ids)

	# Step 4
	print(">>> Searching for connected components...")
	components = connected_components.compute(nodes, edges)

	# Step 5
	print(">>> Merging nodes and edges...")
	merged = merge.merge(nodes, edges, components)
	del nodes
	del edges

	# Step 6
	print(">>> Sorting according to connected components...")
	merged.sort()

	# Step 7
	print(">>> Writing to file...")

	bar = StatusBar(len(merged))
	counter = 0
	# the with statement closes the file, no explicit close() required
	with open(path_to_output, "w") as out:
		out.write(HEADER)
		for line in merged:
			out.write(line)
			counter += 1
			if counter % 10000 == 0: bar.update(counter)
	bar.close()
	del merged

	# say goodbye
	diff = datetime.now() - begin
	print(">>> Jobs Done! [%s]" % str(timedelta(seconds=int(diff.total_seconds()))))