Example 1
def train_clustering_parameters(vector_inpath):
    """
	Function that tries to figure out the optimal clustering parameters for DBSCAN, i.e. epsilon,
	min_samples and p.

	Args:
		vector_inpath (str): Path to vector file. File has to have the following format (separated by spaces):
			<index of original vector #1> <index of original vector #2> <Dimension 1> ... <Dimension n>
	"""
    x0 = numpy.array([2.5, 20, 2])  # Initial parameter guess for (eps, min_samples, p)
    print alt("Load mappings...")
    indices, model = load_mappings_from_model(vector_inpath)
    X = numpy.array([model[key] for key in indices])  # Arrange data for optimization
    print alt("Start training...")

    # Cluster data and calculate loss as silhouette coefficient per cluster
    def simple_clustering(x):
        # Start clustering
        print alt("Current parameters: %s" % (str(x)))
        dbscan = DBSCAN(eps=x[0], min_samples=x[1], p=x[2])
        dbscan.fit(X)

        # Evaluate results
        cluster_sizes = get_cluster_size(dbscan.labels_)
        print alt("Current cluster sizes: %s" % (cluster_sizes))
        sscore = silhouette_score(X, dbscan.labels_)
        tscore = (sscore / (len(cluster_sizes.keys()) - 1))
        print alt("Current value of objective function: %.5f" % (tscore))
        print "-" * 50
        return tscore

    # Start minimizing
    result = minimize(simple_clustering, x0, method="Nelder-Mead")
    print result.x  # Print resulting parameter configuration
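DBSCAN's min_samples is an integer parameter, while Nelder-Mead proposes continuous values. A minimal sketch of how the optimized parameters could be turned into one final clustering run, reusing X and result from the function above (this follow-up step is not part of the original code):

from sklearn.cluster import DBSCAN

eps_opt, min_samples_opt, p_opt = result.x
# Cast min_samples back to an integer before the final fit
final_dbscan = DBSCAN(eps=eps_opt, min_samples=int(round(min_samples_opt)), p=p_opt)
final_dbscan.fit(X)
print("Final cluster labels: %s" % (set(final_dbscan.labels_)))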
Example 2
def cluster_mappings(vector_inpath,
                     do_pca=False,
                     target_dim=100,
                     indices_inpath=None,
                     epsilon=2.625,
                     min_s=20):
    """
	Cluster mapping vectors created with :py:mod:`src.mapping.mapthreading` or :py:mod:`rc.mapping.map_vectors.py`.
	Because just reading the number of clusters and their sizes is not very informative, there's an option to resolve
	the indices of the vectors in each cluster to their original word pairs.

	Args:
		vector_inpath (str): Path to vector file. File should have the following format (separated by spaces):
			<index of original vector #1> <index of original vector #2> <Dimension 1> ... <Dimension n>
		do_pca (bool): Flag to indicate whether PCA should be executed before clustering to reduce the amount of
			computation.
		target_dim (int): Number of dimensions vectors should be shrunk to in case PCA is performed.
		indices_inpath (str): Path to file with the indices given to words. The file should have the following format:
			<index of word>	<word> (separated by tab)
		epsilon (float): Radius of circle DBSCAN uses to look for other data points.
		min_s (int): Minimum number of points in radius epsilon DBSCAN needs to declare a point a core object.
	"""
    print alt("Load mappings...")
    indices, model = load_mappings_from_model(vector_inpath)
    X = numpy.array([model[key] for key in indices])
    if not indices_inpath:
        del model  # free up memory; the model is only needed again if indices are resolved below

    # do PCA if wanted
    if do_pca:
        print alt("Truncate vectors with PCA to %i dimensions..." % target_dim)
        pca = PCA(n_components=target_dim)
        pca.fit(X)
        X = pca.transform(X)

    # Start clustering
    print alt("Cluster points...")
    dbscan = DBSCAN(eps=epsilon, min_samples=min_s, p=2)
    dbscan.fit(X)

    # Get results
    labels = dbscan.labels_
    print alt("Cluster sizes:")
    print get_cluster_size(labels)
    print alt("Finished clustering!")
    sscore = silhouette_score(X, labels)
    print("Silhouette Coefficient: %0.3f" % (sscore))

    # Resolve indices and print all word-pairs in clusters if wanted
    if indices_inpath:
        resolve_indices(indices, labels, indices_inpath, model)
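load_mappings_from_model is not shown in this listing; based on the vector file format documented in the docstrings above, a hypothetical loader could look roughly like this (the two leading integer columns form the key, the remaining columns the vector):

import codecs

import numpy


def load_mappings_sketch(vector_inpath):
    indices, model = [], {}
    with codecs.open(vector_inpath, "r", "utf-8") as infile:
        for line in infile:
            parts = line.strip().split()
            if not parts:
                continue
            key = (int(parts[0]), int(parts[1]))  # indices of the two original vectors
            indices.append(key)
            model[key] = numpy.array([float(dim) for dim in parts[2:]])
    return indices, model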
Example 3
def convert_decow_to_plain(decow_dir, out_dir, log_path, merge_nes,
                           log_interval):
    """
	Convert the whole corpus into plain text.

	Args:
		decow_dir (str): Path to directory with decow corpus paths.
		out_dir (str): Path where plain text parts should be written to.
		log_path (str): Path where the log files should be written to.
		merge_nes (bool): Flag to indicate whether multi-word named entities should be merged with underscores.
		log_interval (int): Interval in seconds at which the current processing state is logged.
	"""
    # Split logging interval into hours - minutes - seconds
    m_proc, s_proc = divmod(log_interval, 60)
    h_proc, m_proc = divmod(m_proc, 60)

    # Init logfile
    with codecs.open(log_path, "a", "utf-8") as log_file:
        log_file.write(alt("Starting logging...\n"))
        log_file.write(alt("Corpus (parts) directory:\t%s\n" % decow_dir))
        log_file.write(alt("Output directory:\t\t%s\n" % out_dir))
        log_file.write(alt("Logging path:\t\t%s\n" % log_path))
        log_file.write(
            alt("Logging intervals:\n\t Every %2dh %2dm %2ds for metalog\n" %
                (h_proc, m_proc, s_proc)))

    # Start processes
    @log_time(log_path, log_interval)
    def _convert_decow_to_plain(decow_dir, out_dir, log_path, merge_nes,
                                log_interval):
        with codecs.open(log_path, "a", "utf-8") as log_file:
            inpaths = [
                path for path in os.listdir(decow_dir)
                if ".DS_Store" not in path and "decow" in path
            ]
            log_file.write(
                alt("Preparing %i process(es)...\n" % (len(inpaths))))
            pool = multiprocessing.Pool(processes=len(inpaths))
            log_file.write(alt("Starting process(es)!\n"))
            if merge_nes:
                pool.map(convert_part_merging,
                         [(decow_dir + inpath, out_dir, log_path, log_interval)
                          for inpath in inpaths])
            else:
                pool.map(convert_part,
                         [(decow_dir + inpath, out_dir, log_path, log_interval)
                          for inpath in inpaths])

    _convert_decow_to_plain(decow_dir, out_dir, log_path, merge_nes,
                            log_interval)
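convert_part and convert_part_merging themselves are not part of this listing. Since multiprocessing.Pool.map passes exactly one argument per call, a worker compatible with the call pattern above would have to accept a single 4-tuple and unpack it itself; a hypothetical stub:

def convert_part(args):
    # Pool.map passes one argument, so the four parameters arrive packed in a tuple
    inpath, out_dir, log_path, log_interval = args
    print("Would convert %s and write the plain text to %s" % (inpath, out_dir))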
Example 4
    def _convert_part_merging(inpath, dir_outpath, log_path):
        with codecs.open(log_path, "a", "utf-8") as log_file:
            process_name = multiprocessing.current_process().name
            log_file.write(
                alt("%s: Start logging processing of\n\t%s to \n\t%s...\n" %
                    (process_name, inpath, dir_outpath)))
            file_n = get_file_number(inpath)
            outpath = dir_outpath + 'decow%s_out.txt' % (str(file_n))

            with gz.open(inpath, 'rb') as infile, \
                    codecs.open(outpath, 'wb', 'utf-8') as outfile:
                sentence = []
                line, lcount = infile.readline().strip().decode("utf-8"), 1

                while line != "":
                    if lcount % 100000 == 0:
                        log_file.write(
                            alt("%s: Processing line nr. %i...\n" %
                                (process_name, lcount)))

                    ne = extract_named_entity(line)  # Extract possible named entity

                    if line.startswith(u'<s'):
                        outfile.write('%s\n' % (' '.join(sentence)))
                        sentence = []
                    # If there was a named entity found, try to complete it if it's a multi-word expression
                    elif ne is not None:
                        while True:
                            next_line = infile.readline().strip().decode(
                                "utf-8")
                            lcount += 1
                            if not contains_tag(next_line):
                                next_ne = extract_named_entity(next_line)
                                if next_ne is not None and next_ne[1] == ne[1]:
                                    ne = ("%s_%s" % (ne[0], next_ne[0]), ne[1])
                                else:
                                    break
                            else:
                                break
                        sentence.append(ne[0])
                        line = next_line
                        continue
                    elif not line.startswith(u'<'):
                        sentence.append(line.split('\t')[0])
                    line, lcount = infile.readline().strip().decode("utf-8"), lcount + 1
Example 5
def load_indices(indices_inpath):
    """
	Load word indices from a file. The file should have the following format: <index of word>	<word> (separated by
	tab)

	Args:
		indices_inpath (str): Path to index file.
	"""
    print alt("Load indices...")
    indices = defaultdict(str)
    with codecs.open(indices_inpath, "rb", "utf-8") as indices_file:
        line = indices_file.readline().strip()
        while line:
            parts = line.split("\t")
            index = int(parts[0])
            word = parts[1].replace(" ", "_")
            indices[index] = word
            line = indices_file.readline().strip()
    return indices
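A quick usage illustration with a hypothetical tab-separated index file (the path and contents are made up for the example):

# contents of indices.txt:
#   0	New York
#   1	Berlin
indices = load_indices("indices.txt")
print(indices[0])  # prints: New_York  (spaces inside a word are replaced by underscores)
print(indices[1])  # prints: Berlin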
Example 6
def _convert_decow_to_plain(decow_dir, out_dir, log_path, merge_nes,
                            log_interval):
    with codecs.open(log_path, "a", "utf-8") as log_file:
        inpaths = [
            path for path in os.listdir(decow_dir)
            if ".DS_Store" not in path and "decow" in path
        ]
        log_file.write(
            alt("Preparing %i process(es)...\n" % (len(inpaths))))
        pool = multiprocessing.Pool(processes=len(inpaths))
        log_file.write(alt("Starting process(es)!\n"))
        if merge_nes:
            pool.map(convert_part_merging,
                     [(decow_dir + inpath, out_dir, log_path, log_interval)
                      for inpath in inpaths])
        else:
            pool.map(convert_part,
                     [(decow_dir + inpath, out_dir, log_path, log_interval)
                      for inpath in inpaths])
Example 7
def aggregate_cluster(points, labels):
    """
	Arranges all clusters in a dictionary that maps each cluster label to the list of all datapoints
	belonging to that cluster. Noise points (label -1) are skipped.

	Args:
		points (list): List of datapoints
		labels (list): List of cluster labels, one per datapoint

	Returns:
		defaultdict: dictionary mapping each cluster label to the list of datapoints in that cluster
	"""
    print alt("Aggregate clusters...")
    clusters = defaultdict(list)
    for i in range(len(labels)):
        label = labels[i]
        if label == -1:
            continue  # skip noise points
        clusters[label].append(points[i])
    return clusters
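A small usage illustration with made-up datapoints; in the functions above, the labels come from DBSCAN's labels_ attribute:

points = [[0.0, 0.1], [0.2, 0.1], [5.0, 5.1], [9.9, 9.8]]
labels = [0, 0, 1, -1]  # one label per datapoint; -1 marks noise

clusters = aggregate_cluster(points, labels)
# clusters[0] -> [[0.0, 0.1], [0.2, 0.1]]
# clusters[1] -> [[5.0, 5.1]]
# the noise point [9.9, 9.8] is not assigned to any cluster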
Example 8
def word_sim_eval(vector_inpath, wordpair_path, format="google"):
    """
	Function that lets the system assign word pairs a similarity score based on the cosine similarity of their word
	embeddings. Then, the correlation between those scores and human ratings is measured with Pearson's rho.

	Args:
		vector_inpath (str): Path to vector file. File has to have the following format (separated by spaces):
			<index of original vector #1> <index of original vector #2> <Dimension 1> ... <Dimension n>
		wordpair_path (str): Path to word pair file.
		format (str): Format of word pair file {google|semrel}
	"""
    alt("Loading model...\n")
    model = load_vectors_from_model(vector_inpath)[1]

    # Read word pairs with values
    alt("Loading word pairs...\n")
    raw_pairs, x = read_wordpairs(wordpair_path, format)
    y = [None] * len(raw_pairs)
    pairs = list(enumerate(raw_pairs))  # (pair_id, (word1, word2)) tuples

    # Calculate similarity values for pairs
    alt("Calculating word pair similarities...\n")
    error_counter = 0
    for pair_id, pair in pairs:
        try:
            a = model[capitalize(pair[0])]
            b = model[capitalize(pair[1])]
            sim = cosine(a, b)
            y[pair_id] = sim
        except TypeError:
            error_counter += 1

    # Remove pairs where no word embedding was found
    x, y = remove_unknowns(x, y)

    # Calculate results
    rho, t, z = evaluate_wordpair_sims(x, y, len(pairs))

    successful_pairs = len(pairs) - error_counter
    successful_percentage = (len(pairs) -
                             error_counter * 1.0) / len(pairs) * 100.0
    alt("Calculated Pearson's rho for %i pairs (%.2f %%).\n\tr = %.4f\n\tt = %.4f\n\tz = %.4f\n"
        % (successful_pairs, successful_percentage, rho, t, z))
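read_wordpairs and evaluate_wordpair_sims are not part of this listing. The core of the evaluation, turning embeddings into similarity scores and correlating them with human ratings, could be sketched with scipy as follows (toy numbers; note that scipy.spatial.distance.cosine returns a distance, i.e. 1 - similarity, so it has to be inverted if a similarity is wanted):

import numpy
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr

a = numpy.array([0.1, 0.3, 0.5])
b = numpy.array([0.2, 0.1, 0.4])
similarity = 1.0 - cosine(a, b)  # invert the distance to get a similarity

x = [9.0, 7.5, 1.2]     # human ratings for three word pairs (made up)
y = [0.92, 0.81, 0.15]  # system similarity scores for the same pairs (made up)
rho, p_value = pearsonr(x, y)
print("Pearson's rho: %.4f (p = %.4f)" % (rho, p_value))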
Example 9
    def simple_clustering(x):
        # Start clustering
        print alt("Current parameters: %s" % (str(x)))
        dbscan = DBSCAN(eps=x[0], min_samples=x[1], p=x[2])
        dbscan.fit(X)

        # Evaluate results
        cluster_sizes = get_cluster_size(dbscan.labels_)
        print alt("Current cluster sizes: %s" % (cluster_sizes))
        sscore = silhouette_score(X, dbscan.labels_)
        tscore = (sscore / (len(cluster_sizes.keys()) - 1))
        print alt("Current value of objective function: %.5f" % (tscore))
        print "-" * 50
        return tscore
Example 10
		def wrapper(*args, **kwargs):
			result = [None]

			# Run the wrapped function in a separate thread and capture its return value
			def return_value(*args, **kwargs):
				result[0] = func(*args, **kwargs)

			t = threading.Thread(target=return_value, args=args, kwargs=kwargs)
			log_entries = 0
			with codecs.open(logpath, "a", "utf-8") as logfile:
				start_time = time.time()
				t.start()
				# Poll the worker thread and write an entry to the log once per interval
				while t.is_alive():
					elapsed_time = (time.time() - start_time)
					if elapsed_time > interval * log_entries:
						m, s = divmod(elapsed_time, 60)
						h, m = divmod(m, 60)
						logfile.write(alt("Elapsed time for function '%s': %2dh %2dm %2ds\n"
									  % (func.__name__, h, m, s)))
						log_entries += 1
			return result[0]
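The snippet above is only the innermost wrapper: func, logpath and interval are closed over from the enclosing scope. Assuming a standard decorator-factory layout (and omitting the alt() message formatting of the original), the full log_time decorator used in Example 3 could be structured roughly like this:

import codecs
import threading
import time


def log_time(logpath, interval):
	def decorator(func):
		def wrapper(*args, **kwargs):
			result = [None]

			def return_value(*args, **kwargs):
				result[0] = func(*args, **kwargs)

			t = threading.Thread(target=return_value, args=args, kwargs=kwargs)
			log_entries = 0
			with codecs.open(logpath, "a", "utf-8") as logfile:
				start_time = time.time()
				t.start()
				while t.is_alive():
					elapsed_time = time.time() - start_time
					if elapsed_time > interval * log_entries:
						m, s = divmod(elapsed_time, 60)
						h, m = divmod(m, 60)
						logfile.write("Elapsed time for function '%s': %2dh %2dm %2ds\n"
									  % (func.__name__, h, m, s))
						log_entries += 1
			return result[0]
		return wrapper
	return decorator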