Example #1
from pyspark import SparkConf, SparkContext

def run(fileName, n_hashes, n_buckets):
    """ Starts the main LSH process.

    Args:
        fileName (string): path of the text file to read
        n_hashes (int): number of hash functions to generate
        n_buckets (int): number of buckets to use

    Returns:
        Vector: buckets of minhash values
    """
    sc = SparkContext(conf=SparkConf())
    # Broadcast the hash functions once so every worker gets a read-only copy
    hashes = sc.broadcast(getHashFunctions(n_hashes))

    text = sc.textFile(fileName)
    stopWords = sc.textFile('path/to/stopwords')  # placeholder path to a stop-word list
    # Strip punctuation from each line; subtract() then drops any element that
    # also appears in stopWords, and cache() keeps the result in memory for reuse
    cleanData = text.map(removePunctuation).subtract(stopWords).cache()
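
The example calls two helpers, getHashFunctions and removePunctuation, whose definitions are not shown. Below is a minimal sketch of what they might look like; the function names come from the example above, but the bodies, the hash formula h(x) = (a * x + b) % prime, and the choice of prime are assumptions, not the original implementation.

import random
import string

def getHashFunctions(n_hashes, prime=4294967311):
    """Hypothetical helper: return n_hashes (a, b) parameter pairs for
    universal hash functions of the form h(x) = (a * x + b) % prime."""
    return [(random.randint(1, prime - 1), random.randint(0, prime - 1))
            for _ in range(n_hashes)]

def removePunctuation(line):
    """Hypothetical helper: lowercase a line and strip punctuation
    so tokens compare cleanly."""
    return line.lower().translate(str.maketrans('', '', string.punctuation))

Returning plain (a, b) tuples rather than callables is deliberate here: tuples serialize cleanly with pickle, which matters because the result is passed to sc.broadcast; workers can apply the formula themselves when computing minhash signatures.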