Example #1
def main():
    """
    Create low-dimensional matrix from count matrix by multiplication with random matrix.
    """

    # Get the arguments
    args = docopt(
        '''Create low-dimensional matrix from count matrix by multiplication with random matrix.

    Usage:
        multiply.py [-l] [-c] <countPath> <randomPath> <outPath>

        <countPath> = path to count matrix
        <randomPath> = path to random matrix
        <outPath> = output path for reduced matrix

    Options:
        -l, --len   normalize final vectors to unit length
        -c, --cen   mean center columns of final matrix

    ''')

    is_len = args['--len']
    is_cen = args['--cen']
    countPath = args['<countPath>']
    randomPath = args['<randomPath>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load matrices
    countSpace = Space(countPath)
    countMatrix = countSpace.matrix
    randomSpace = Space(randomPath)
    randomMatrix = randomSpace.matrix

    logging.info("Multiplying matrices")
    reducedMatrix = np.dot(countMatrix, randomMatrix)
    reducedSpace = Space(matrix=reducedMatrix,
                         rows=countSpace.rows,
                         columns=[])

    if is_len:
        logging.info("L2-normalize vectors")
        reducedSpace.l2_normalize()

    if is_cen:
        logging.info("Mean center columns")
        reducedSpace.mean_center()

    # Save the reduced matrix
    reducedSpace.save(outPath)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
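
The reduction above is a plain random projection: the count matrix is multiplied by a random matrix whose entries are independent of the data. A minimal standalone sketch with toy shapes (all names and sizes here are hypothetical, not part of multiply.py):

import numpy as np

rng = np.random.default_rng(0)
counts = rng.poisson(1.0, size=(1000, 5000)).astype(float)  # toy count matrix
# Sparse ternary random matrix in the style of Achlioptas (2001)
random_matrix = rng.choice([-1.0, 0.0, 1.0], size=(5000, 300),
                           p=[0.05, 0.9, 0.05])
reduced = counts @ random_matrix  # same operation as np.dot above
print(reduced.shape)  # (1000, 300)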
Example #2
def main():
    """
    Mean center matrix.
    """

    # Get the arguments
    args = docopt('''Mean center matrix.

    Usage:
        center.py [-l] [-w] <matrixPath> <outPath>

        <matrixPath> = path to matrix
        <outPath> = output path for space

    Options:
        -l, --len   normalize vectors to unit length before centering
        -w, --w2v   save in word2vec format

    ''')

    is_len = args['--len']
    is_w2v = args['--w2v']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load matrices and rows
    try:
        space = Space(matrixPath, format='npz')
    except ValueError:
        space = Space(matrixPath, format='w2v')

    if is_len:
        # L2-normalize vectors
        space.l2_normalize()

    # Mean center
    space.mean_center()

    # Save the matrix
    if is_w2v:
        space.save(outPath, format='w2v')
    else:
        space.save(outPath)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
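
Space.l2_normalize and Space.mean_center themselves are not shown in these examples; a plausible dense-matrix reading of the two operations, given only as an assumption:

import numpy as np

def l2_normalize(matrix):
    # Assumed behaviour: scale each row (vector) to unit length
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave zero rows untouched
    return matrix / norms

def mean_center(matrix):
    # Assumed behaviour: subtract the column means
    return matrix - matrix.mean(axis=0)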
Example #3
def main():
    """
    Higher-order similarity matrix.
    """

    # Get the arguments
    args = docopt('''Apply the similarity order transformation.

    Usage:
        sot.py [-l] <matrixPath> <outPath> <alpha>

        <matrixPath>    = path to matrix
        <outPath>       = output path for space
        <alpha>         = the desired similarity-order

    Options:
        -l, --len   normalize vectors to unit length before the transformation

    ''')

    is_len = args['--len']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']
    alpha = float(args['<alpha>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load matrices and rows
    try:
        space = Space(matrixPath, format='npz')
    except ValueError:
        space = Space(matrixPath, format='w2v')

    # L2-normalize vectors
    if is_len:
        space.l2_normalize()

    # Similarity matrix
    space.transform_similarity_order(alpha)

    # Save the matrix
    space.save(outPath, format="w2v")

    logging.info("--- %s seconds ---" % (time.time() - start_time))
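
transform_similarity_order is also not shown. If it follows the similarity-order transformation of Artetxe et al. (2018), a sketch would rescale the singular values of the matrix, so that alpha=1 preserves first-order similarities and larger alpha emphasizes higher-order ones (an assumption, not the confirmed implementation):

import numpy as np

def transform_similarity_order(matrix, alpha):
    # Sketch under the assumption that the method follows Artetxe et al. (2018):
    # with X = U diag(s) Vt, return U diag(s)**alpha, so the similarity matrix
    # becomes U diag(s)**(2*alpha) U.T (alpha=1 recovers X @ X.T).
    u, s, _ = np.linalg.svd(matrix, full_matrices=False)
    return u * (s ** alpha)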
Example #4
def main():
    """
    Optionally mean-center the matrix and remove the top n PCA components.
    """

    # Get the arguments
    args = docopt(
        '''Optionally mean-center the matrix, then remove the top n PCA components.

    Usage:
        pcr.py [-m] <matrixPath> <outPath> <threshold>

        <matrixPath> = path to matrix
        <outPath> = output path for space
        <threshold> = number of top PCA components to remove

    Options:
        -m, --mean  apply mean centering before removing components

    ''')

    matrix_path = args['<matrixPath>']
    out_path = args['<outPath>']
    threshold = args['<threshold>']

    is_mean = args['--mean']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    try:
        space = Space(matrix_path, format='npz')
        _format_flag = 'npz'
    except ValueError:
        space = Space(matrix_path, format='w2v')
        _format_flag = 'w2v'

    # MC+PCR
    space.mc_pcr(int(threshold), is_mean)

    # Save the matrix
    space.save(out_path, format=_format_flag)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
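
mc_pcr is not shown either; if it implements principal component removal in the style of Mu and Viswanath (2018, "All-but-the-Top"), a sketch would mean-center (when the flag is set) and subtract the projections onto the top n components (an assumption):

import numpy as np
from sklearn.decomposition import PCA

def mc_pcr(matrix, n_components, mean_center=True):
    # Sketch of "all-but-the-top"-style component removal (assumed behaviour)
    if mean_center:
        matrix = matrix - matrix.mean(axis=0)
    pca = PCA(n_components=n_components).fit(matrix)
    top = pca.components_  # shape (n_components, dim)
    return matrix - (matrix @ top.T) @ top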
Example #5
def main():

    # Get the arguments
    args = docopt("""

    Usage:
        LSC_W2V.py  <pathSentences1> <pathSentences2> <outPathVectors> <outPathLabels> <outPathResults> <sentenceType> <clusteringInitialization> <clustering> <limitAGL> <limitCOS> <limitCluster> <windowSize>
        LSC_W2V.py  <pathSentences1> <pathSentences2> <sentenceType> <clusteringInitialization> <clustering> <limitAGL> <limitCOS> <limitCluster> <windowSize>  
    
    Arguments:
       
        <pathSentences1> = Path to the test sentences from time1
        <pathSentences2> = Path to the test sentences from time2
        <outPathVectors> = Path to store the vectors
        <outPathLabels> = Path to store the clustering labels
        <outPathResults> = Path to store the lsc scores
        <sentenceType> = "lemma" or "token"
        <clusteringInitialization> = "gaac" for precalculated initializations, else random
        <clustering> = "kmeans" or "hierarchical"
        <limitAGL> = Change score limit for AGL to still be considered as change (Good is about 0.2)
        <limitCOS> = Change score limit for Cosine to still be considered as change (Good is about 0.02)
        <limitCluster> = Minimum number of elements a cluster must contain from one time, with fewer from the other, to be counted as a change (Good is 5-10)
        <windowSize> = Window size for words to be in context of other words (Good is 20)
        


    """)

    pathSentences1 = args['<pathSentences1>']
    pathSentences2 = args['<pathSentences2>']
    outPathVectors = args['<outPathVectors>']
    outPathLabels = args['<outPathLabels>']
    clusteringInitialization = args['<clusteringInitialization>']
    clustering = args['<clustering>']
    pathResults = args['<outPathResults>']
    limitAGL = float(args['<limitAGL>'])
    limitCOS = float(args['<limitCOS>'])
    limitCluster = int(args['<limitCluster>'])
    windowSize = int(args['<windowSize>'])
    sentenceType = args['<sentenceType>']

    if len(sys.argv) == 10:
        outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz"
        outPathLabels = "Files/Clustering/cluster_labels.csv"
        pathResults = "Files/LSC/lsc_scores.csv"

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.CRITICAL)
    print("")
    start_time = time.time()
    logging.critical("W2v LSC start")

    #Create the vectors of corpus 1
    logging.critical("Create the vectors of corpus 1")
    get_ipython().run_line_magic(
        'run',
        'WordSenseClustering/W2v.py $pathSentences1 $outPathVectors $windowSize $sentenceType'
    )

    inSpace = Space(path=outPathVectors)
    vectors1 = inSpace.matrix.toarray()

    #Create the vectors of corpus 2
    logging.critical("Create the vectors of corpus 2")
    get_ipython().run_line_magic(
        'run',
        'WordSenseClustering/W2v.py $pathSentences2 $outPathVectors $windowSize $sentenceType'
    )
    inSpace = Space(path=outPathVectors)
    vectors2 = inSpace.matrix.toarray()

    #Create the lists to store the binary results in
    cosineDistanceBinary = []
    APDBinary = []
    clusterScoreBinary = []

    #Calculate cosineDistance for the two vectors
    cosineDistance = getCOS(vectors1, vectors2)
    if cosineDistance >= limitCOS:
        cosineDistanceBinary.append(1)
    else:
        cosineDistanceBinary.append(0)

    #Calculate Average pairwise distance for the two vectors
    APD = getAPD(vectors1, vectors2, 200)
    if APD >= limitAGL:
        APDBinary.append(1)
    else:
        APDBinary.append(0)

    #Create and cluster the combined vectors of both corpora
    logging.critical("Create and cluster the combined vectors of both corpora")
    vectors = np.concatenate((vectors1, vectors2), axis=0)
    outSpace = Space(matrix=vectors, rows=" ", columns=" ")
    outSpace.save(outPathVectors)
    #Cluster the combined vectors
    get_ipython().run_line_magic(
        'run',
        'WordSenseClustering/Clustering.py $outPathVectors 0 $outPathLabels 0 $clusteringInitialization 0 $clustering'
    )

    #Load list of labels
    labels = []
    with open(outPathLabels, 'r') as file:
        data = file.readlines()
    for label in data[-1].strip().split(","):
        if label != "":
            labels.append(int(label))  # split on commas so labels >= 10 parse correctly

    # Calculate cluster LSC score
    labelA_1 = []
    labelA_2 = []

    maximum = len(vectors1)
    for i in range(0, len(vectors1)):
        labelA_1.append(labels[i])

    for i in range(maximum, maximum + len(vectors2)):
        labelA_2.append(labels[i])

    changeA = 0
    for j in set(labels):
        if labelA_1.count(j) >= limitCluster:
            if labelA_2.count(j) < limitCluster:
                changeA = 1
        if labelA_2.count(j) >= limitCluster:
            if labelA_1.count(j) < limitCluster:
                changeA = 1

    clusterScoreBinary.append(changeA)

    p = np.histogram(labelA_1)[0] / len(labelA_1)
    q = np.histogram(labelA_2)[0] / len(labelA_2)

    dist = distance.jensenshannon(p, q)

    filename1 = os.path.splitext(os.path.basename(pathSentences1))[0]
    filename2 = os.path.splitext(os.path.basename(pathSentences2))[0]

    cos = [filename1, filename2, "cosineDistance", cosineDistance]
    apd = [filename1, filename2, "APD", APD]
    cluster = [filename1, filename2, "clusterScore", dist]
    cosBin = [
        filename1, filename2, "cosineDistanceBinary", cosineDistanceBinary[0]
    ]
    APDBin = [filename1, filename2, "APDBinary", APDBinary[0]]
    clusterBin = [
        filename1, filename2, "clusterScoreBinary", clusterScoreBinary[0]
    ]

    print("Graded LSC:")
    print("")
    print("cosine distance:")
    print(cosineDistance)
    print("")
    print("Average pairwise distance:")
    print(APD)
    print("")
    print("JSD:")
    print(dist)
    print("")
    print("")
    print("Binary LSC:")
    print("")
    print("cosine distance binary:")
    print(cosineDistanceBinary[0])
    print("APD distance binary:")
    print(APDBinary[0])
    print("JSD binary:")
    print(clusterScoreBinary[0])

    with open(pathResults, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerows([cos, apd, cluster, cosBin, APDBin, clusterBin])

    logging.critical("W2v LSC end")
    logging.critical("--- %s seconds ---" % (time.time() - start_time))
    print("")
Example #6
def main():
    """
    Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices as described in:
       Pierpaolo Basile, Annalina Caputo and Giovanni Semeraro, 2014. Analysing Word Meaning over Time by Exploiting Temporal Random Indexing.
    """

    # Get the arguments
    args = docopt(
        '''Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices.

    Usage:
        srv_align.py [-l] (-s <seeds> | -a) <matrixPath1> <matrixPath2> <outPath1> <outPath2> <outPathElement> <dim> <t>

        <seeds> = number of non-zero values in each random vector
        <matrixPath1> = path to matrix1
        <matrixPath2> = path to matrix2
        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <outPathElement> = output path for elemental space (context vectors)
        <dim> = number of dimensions for random vectors
        <t> = threshold for downsampling (if t=None, no subsampling is applied)

    Options:
        -l, --len   normalize final vectors to unit length
        -s, --see   specify number of seeds manually
        -a, --aut   calculate number of seeds automatically as proposed in [1,2]
  
    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')

    is_len = args['--len']
    is_seeds = args['--see']
    if is_seeds:
        seeds = int(args['<seeds>'])
    is_aut = args['--aut']
    matrixPath1 = args['<matrixPath1>']
    matrixPath2 = args['<matrixPath2>']
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']
    outPathElement = args['<outPathElement>']
    dim = int(args['<dim>'])
    if args['<t>'] == 'None':
        t = None
    else:
        t = float(args['<t>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load input matrices
    space1 = Space(matrixPath1)
    matrix1 = space1.matrix
    space2 = Space(matrixPath2)
    matrix2 = space2.matrix

    # Get mappings between rows/columns and words
    rows1 = space1.rows
    id2row1 = space1.id2row
    row2id1 = space1.row2id
    columns1 = space1.columns
    column2id1 = space1.column2id
    rows2 = space2.rows
    id2row2 = space2.id2row
    row2id2 = space2.row2id
    columns2 = space2.columns
    column2id2 = space2.column2id

    # Get union of rows and columns in both spaces
    unified_rows = sorted(list(set(rows1).union(rows2)))
    unified_columns = sorted(list(set(columns1).union(columns2)))
    columns_diff1 = sorted(list(set(unified_columns) - set(columns1)))
    columns_diff2 = sorted(list(set(unified_columns) - set(columns2)))

    # Get mappings of indices of columns in original spaces to indices of columns in unified space
    c2i = {w: i for i, w in enumerate(unified_columns)}
    cj2i1 = {j: c2i[w] for j, w in enumerate(columns1 + columns_diff1)}
    cj2i2 = {j: c2i[w] for j, w in enumerate(columns2 + columns_diff2)}

    if t is not None:
        rows_diff1 = list(set(unified_rows) - set(rows1))
        rows_diff2 = list(set(unified_rows) - set(rows2))

        r2i = {w: i for i, w in enumerate(unified_rows)}
        rj2i1 = {j: r2i[w] for j, w in enumerate(rows1 + rows_diff1)}
        rj2i2 = {j: r2i[w] for j, w in enumerate(rows2 + rows_diff2)}

        # Build spaces with unified COLUMNS
        # Get empty columns for additional context words
        new_columns1 = csc_matrix((len(rows1), len(columns_diff1)))
        # First concatenate matrix and empty columns, then order columns
        # according to unified_columns
        unified_matrix1 = csc_matrix(hstack((matrix1, new_columns1)))[
            :, sorted(cj2i1, key=cj2i1.get)]

        new_columns2 = csc_matrix((len(rows2), len(columns_diff2)))
        unified_matrix2 = csc_matrix(hstack(
            (matrix2, new_columns2)))[:, sorted(cj2i2, key=cj2i2.get)]

        # Build spaces with unified ROWS
        new_rows1 = csc_matrix((len(rows_diff1), len(unified_columns)))
        final_unified_matrix1 = csc_matrix(vstack(
            (unified_matrix1, new_rows1)))[sorted(rj2i1, key=rj2i1.get)]

        new_rows2 = csc_matrix((len(rows_diff2), len(unified_columns)))
        final_unified_matrix2 = csc_matrix(vstack(
            (unified_matrix2, new_rows2)))[sorted(rj2i2, key=rj2i2.get)]

        # Add up final unified matrices
        common_unified_matrix = np.add(final_unified_matrix1,
                                       final_unified_matrix2)

        # Get number of total occurrences of any word
        totalOcc = np.sum(common_unified_matrix)

        # Define function for downsampling
        downsample = lambda f: np.sqrt(float(t) / f) if f > t else 1.0
        downsample = np.vectorize(downsample)

        # Get total normalized co-occurrence frequency of all contexts in both spaces
        context_freqs = np.array(common_unified_matrix.sum(axis=0) /
                                 totalOcc)[0]

    ## Generate ternary random vectors
    if is_seeds:
        elementalMatrix = lil_matrix((len(unified_columns), dim))
        # Generate base vector for random vectors
        # Note: make sure that the number of seeds is not greater than dim
        baseVector = np.zeros(dim)
        for i in range(0, int(seeds / 2)):
            baseVector[i] = 1.0
        for i in range(int(seeds / 2), seeds):
            baseVector[i] = -1.0
        # To-do: make this more efficient by generating random indices for a whole array
        for i in range(len(unified_columns)):
            np.random.shuffle(baseVector)
            elementalMatrix[i] = baseVector
    if is_aut:
        elementalMatrix = sparse_random_matrix(dim, len(unified_columns)).T

    # Initialize target vectors
    alignedMatrix1 = np.zeros((len(rows1), dim))
    alignedMatrix2 = np.zeros((len(rows2), dim))

    # Iterate over rows of space, find context words and update aligned matrix with low-dimensional random vectors of these context words
    for (matrix, id2row, cj2i,
         alignedMatrix) in [(matrix1, id2row1, cj2i1, alignedMatrix1),
                            (matrix2, id2row2, cj2i2, alignedMatrix2)]:
        # Iterate over targets
        for i in id2row:
            # Get co-occurrence values as matrix
            m = matrix[i]
            # Get nonzero indexes
            nonzeros = m.nonzero()
            nonzeros = [cj2i[j] for j in nonzeros[1]]
            data = m.data
            pos_context_vectors = elementalMatrix[nonzeros]
            if t is not None:
                # Apply subsampling
                rfs = context_freqs[nonzeros]
                rfs = downsample(rfs)
                data *= rfs
            # Weight context vectors by occurrence frequency
            pos_context_vectors = pos_context_vectors.multiply(
                data.reshape(-1, 1))
            # Add up context vectors and store as row for target
            alignedMatrix[i] = np.sum(pos_context_vectors, axis=0)

    outSpace1 = Space(matrix=alignedMatrix1, rows=rows1, columns=[])
    outSpace2 = Space(matrix=alignedMatrix2, rows=rows2, columns=[])

    if is_len:
        # L2-normalize vectors
        outSpace1.l2_normalize()
        outSpace2.l2_normalize()

    # Save the matrices
    outSpace1.save(outPath1)
    outSpace2.save(outPath2)
    Space(matrix=elementalMatrix, rows=unified_columns,
          columns=[]).save(outPathElement)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
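
The downsampling factor is the word2vec-style subsampling weight sqrt(t/f) for context frequencies above the threshold t. A quick numeric check of the lambda used above:

import numpy as np

t = 0.001
downsample = np.vectorize(lambda f: np.sqrt(t / f) if f > t else 1.0)
freqs = np.array([0.0001, 0.001, 0.01, 0.1])
print(downsample(freqs))  # [1. 1. 0.3162... 0.1] -- frequent contexts are damped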
Example #7
def main():
    """
    Create low-dimensional vector space by sparse random indexing from co-occurrence matrix.
    """

    # Get the arguments
    args = docopt('''Create low-dimensional vector space by sparse random indexing from co-occurrence matrix.

    Usage:
        ri.py [-l] (-s <seeds> | -a) <matrixPath> <outPath> <outPathElement> <dim> <t>

        <seeds> = number of non-zero values in each random vector
        <matrixPath> = path to matrix
        <outPath> = output path for reduced space 
        <outPathElement> = output path for elemental space (context vectors)
        <dim> = number of dimensions for random vectors
        <t> = threshold for downsampling (if t=None, no subsampling is applied)

    Options:
        -l, --len   normalize final vectors to unit length
        -s, --see   specify number of seeds manually
        -a, --aut   calculate number of seeds automatically as proposed in [1,2]

    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')
    
    is_len = args['--len']
    is_seeds = args['--see']
    if is_seeds:
        seeds = int(args['<seeds>'])
    is_aut = args['--aut']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']
    outPathElement = args['<outPathElement>']
    dim = int(args['<dim>'])
    if args['<t>'] == 'None':
        t = None
    else:
        t = float(args['<t>'])
    
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()    

    # Load input matrix
    space = Space(matrixPath)   
    matrix = space.matrix
    
    # Get mappings between rows/columns and words
    rows = space.rows
    id2row = space.id2row
    row2id = space.row2id
    columns = space.columns
    id2column = space.id2column
    column2id = space.column2id

    ## Generate ternary random vectors
    if is_seeds:
        elementalMatrix = lil_matrix((len(columns), dim))
        # Generate base vector for random vectors
        # Note: make sure that the number of seeds is not greater than dim
        baseVector = np.zeros(dim)
        for i in range(0, int(seeds / 2)):
            baseVector[i] = 1.0
        for i in range(int(seeds / 2), seeds):
            baseVector[i] = -1.0
        for i in range(len(columns)):
            np.random.shuffle(baseVector)
            elementalMatrix[i] = baseVector
    if is_aut:
        elementalMatrix = sparse_random_matrix(dim,len(columns)).toarray().T

    elementalMatrix = csc_matrix(elementalMatrix)
    # to-do: get rid of transformation into sparse matrices by initializing them as such

    # Initialize target vectors
    reducedMatrix = np.zeros((len(rows),dim))    

    # Get number of total occurrences of any word
    totalOcc = np.sum(matrix)

    # Define function for downsampling
    downsample = lambda f: np.sqrt(float(t) / f) if f > t else 1.0
    downsample = np.vectorize(downsample)
    
    # Get total normalized co-occurrence frequency of all contexts in space
    context_freqs = np.array(matrix.sum(axis=0))/totalOcc
    
    #to-do: matrix multiplication is done row-wise, do this matrix-wise
    # Iterate over rows of space, find context words and update reduced matrix with low-dimensional random vectors of these context words
    for i in id2row:
        # Get co-occurrence values as matrix
        m = matrix[i]
        # Get nonzero indexes and data
        nonzeros = m.nonzero()
        data = m.data
        # Get low-dimensional random vectors of contexts
        pos_context_vectors = elementalMatrix[nonzeros[1]]
        if t is not None:
            # Apply subsampling
            rfs = context_freqs[0,nonzeros[1]]
            rfs = downsample(rfs)
            data *= rfs
        data = csc_matrix(data)
        # Weight context vectors by occurrence frequency
        pos_context_vectors = pos_context_vectors.multiply(data.reshape(-1,1))
        pos_context_vectors = np.sum(pos_context_vectors, axis=0)
        # Add up context vectors and store as row for target
        reducedMatrix[i] = pos_context_vectors
    
    outSpace = Space(matrix=reducedMatrix, rows=rows, columns=[])

    if is_len:
        # L2-normalize vectors
        outSpace.l2_normalize()
        
    # Save the matrices
    outSpace.save(outPath, format='w2v')
    Space(matrix=elementalMatrix, rows=columns, columns=[]).save(outPathElement)

    logging.info("--- %s seconds ---" % (time.time() - start_time))                   
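
As the to-do comment notes, the row-wise loop can be done matrix-wise: without subsampling (t=None), summing the weighted context vectors per row is exactly one sparse matrix product. A toy sketch (shapes hypothetical):

from scipy.sparse import csc_matrix, random as sparse_random

matrix = sparse_random(100, 500, density=0.05, format='csr')  # toy co-occurrence counts
elementalMatrix = csc_matrix(sparse_random(500, 50, density=0.1))  # toy random vectors
reducedMatrix = (matrix @ elementalMatrix).toarray()  # equals the per-row sums above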
Example #8
def main():
    """
    Compute the smoothed and shifted PPMI matrix from a co-occurrence matrix. Smoothing is performed as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    """

    # Get the arguments
    args = docopt('''Compute the smoothed and shifted PPMI matrix from a co-occurrence matrix and save it.

    Usage:
        ppmi.py [-l] <matrixPath> <outPath> <k> <alpha>

        <matrixPath> = path to matrix
        <outPath> = output path for space
        <k> = shifting parameter
        <alpha> = smoothing parameter

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    is_len = args['--len']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']
    k = int(args['<k>'])
    alpha = float(args['<alpha>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()    

    # Load input matrix
    space = Space(matrixPath)   

    # Apply EPMI weighting
    space.epmi_weighting(alpha)
    
    # Apply log weighting
    space.log_weighting()

    # Shift values
    space.shifting(k)

    # Eliminate negative counts
    space.eliminate_negative()

    # Eliminate zero counts
    space.eliminate_zeros()
        
    outSpace = Space(matrix=space.matrix, rows=space.rows, columns=space.columns)

    if is_len:
        # L2-normalize vectors
        outSpace.l2_normalize()
        
    # Save the matrix
    outSpace.save(outPath)

    logging.info("--- %s seconds ---" % (time.time() - start_time))                   
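
The weighting pipeline corresponds to the smoothed, shifted PPMI of Levy et al. (2015): PPMI_{k,alpha}(w,c) = max(0, log(P(w,c) / (P(w) * P_alpha(c))) - log k). A dense numpy sketch of the same computation (the Space methods themselves are not shown here):

import numpy as np

def ppmi(counts, k=1, alpha=0.75):
    # Smoothed, shifted PPMI on a dense count matrix (sketch, assumed to
    # mirror the epmi_weighting/log_weighting/shifting steps above)
    total = counts.sum()
    p_wc = counts / total
    p_w = counts.sum(axis=1, keepdims=True) / total
    p_c = counts.sum(axis=0, keepdims=True) ** alpha  # context smoothing
    p_c = p_c / p_c.sum()
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi = np.log(p_wc / (p_w * p_c)) - np.log(k)
    return np.maximum(np.nan_to_num(pmi, neginf=0.0), 0.0)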
Example #9
def main():
    """
    Make count-based vector space from corpus.
    """

    # Get the arguments
    args = docopt("""Make count-based vector space from corpus.

    Usage:
        count.py [-l] <corpDir> <outPath> <windowSize>
        
    Arguments:
       
        <corpDir> = path to corpus or corpus directory (iterates through files)
        <outPath> = output path for vectors
        <windowSize> = the linear distance of context words to consider in each direction

    Options:
        -l, --len   normalize final vectors to unit length

    """)

    is_len = args['--len']
    corpDir = args['<corpDir>']
    outPath = args['<outPath>']
    windowSize = int(args['<windowSize>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Build vocabulary
    logging.info("Building vocabulary")
    sentences = PathLineSentences(corpDir)  # iterates through files in a directory
    vocabulary = sorted(
        list(
            set([
                word for sentence in sentences for word in sentence
                if len(sentence) > 1
            ])))  # Skip one-word sentences to avoid zero-vectors
    w2i = {w: i for i, w in enumerate(vocabulary)}

    # Initialize co-occurrence matrix as dictionary
    cooc_mat = defaultdict(lambda: 0)

    # Get counts from corpus
    sentences = PathLineSentences(corpDir)
    logging.info("Counting context words")
    for sentence in sentences:
        for i, word in enumerate(sentence):
            lowerWindowSize = max(i - windowSize, 0)
            upperWindowSize = min(i + windowSize, len(sentence))
            window = sentence[lowerWindowSize:i] + sentence[i +
                                                            1:upperWindowSize +
                                                            1]
            if len(window) == 0:  # Skip one-word sentences
                continue
            windex = w2i[word]
            for contextWord in window:
                cooc_mat[(windex, w2i[contextWord])] += 1

    # Convert dictionary to sparse matrix
    logging.info("Converting dictionary to matrix")
    cooc_mat_sparse = dok_matrix((len(vocabulary), len(vocabulary)),
                                 dtype=float)
    try:
        cooc_mat_sparse.update(cooc_mat)
    except NotImplementedError:
        cooc_mat_sparse._update(cooc_mat)

    outSpace = Space(matrix=cooc_mat_sparse,
                     rows=vocabulary,
                     columns=vocabulary)

    if is_len:
        # L2-normalize vectors
        outSpace.l2_normalize()

    # Save the matrix
    outSpace.save(outPath)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
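
The window slicing takes up to windowSize tokens on each side of the target and is clipped at the sentence boundaries; a quick check of the indexing:

sentence = ['the', 'cat', 'sat', 'on', 'the', 'mat']
windowSize, i = 2, 2  # target word 'sat'
lowerWindowSize = max(i - windowSize, 0)
upperWindowSize = min(i + windowSize, len(sentence))
window = sentence[lowerWindowSize:i] + sentence[i + 1:upperWindowSize + 1]
print(window)  # ['the', 'cat', 'on', 'the']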
Example #10
def main():

    # Get the arguments
    args = docopt("""

    Usage:
        W2v.py  <pathTestSentences> <outPathVectors> <windowSize2> <sentenceType>
        W2v.py  <pathTestSentences> <windowSize2> <sentenceType>
        
    Arguments:
       
        <pathTestSentences> = Path to the test sentences
        <outPathVectors> = Path for storing the vectors 
        <windowSize2> = Window size (20 works well)
        <sentenceType> = "lemma" or "token"
    
    """)

    pathTestSentences = args['<pathTestSentences>']
    outPathVectors = args['<outPathVectors>']
    windowSize2 = int(args['<windowSize2>'])
    sentenceType = args['<sentenceType>']

    if len(sys.argv) == 4:
        outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz"

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.CRITICAL)
    print("")
    start_time = time.time()
    logging.critical("W2V start")

    if sentenceType == "token":
        sentType = "sentence_token"
    else:
        sentType = "sentence"

    if not isinstance(windowSize2, int):
        windowSize2 = 20

    #Load Word2Vec
    model = gensim.models.KeyedVectors.load_word2vec_format(
        'Data/GoogleNews-vectors-negative300.bin', binary=True)

    #Load TestSentences
    contextVectorList = []
    testSentences = []
    with open(pathTestSentences, 'r') as csvFile:
        reader = csv.DictReader(csvFile, delimiter="\t")
        for row in reader:
            testSentences.append(dict(row))

    #Calculate contextVectorMatrix
    logging.critical("Calculate contextVectorMatrix")

    for dic in testSentences:
        sentence = dic[sentType].split()
        for i, word in enumerate(sentence):
            if str(i) == dic['target_index']:

                toMelt = []
                toMeltIDF = []
                lowerWindowSize = max(i - windowSize2, 0)
                upperWindowSize = min(i + windowSize2, len(sentence))
                window = sentence[lowerWindowSize:i] + sentence[
                    i + 1:upperWindowSize + 1]
                if word in model.wv.vocab:
                    for contextWord in window:
                        if contextWord in model.wv.vocab:
                            if contextWord != "$":
                                toMelt.append(
                                    preprocessing.normalize(
                                        [model.wv[contextWord]], norm='l2')[0])

                    contextVectorList.append(getContextVector(toMelt))
                else:
                    contextVectorList.append(np.zeros(300))

    #Normalize vectors in length
    contextVectorList = preprocessing.normalize(contextVectorList, norm='l2')

    #Save contextVectorList_sparse matrix
    outSpace = Space(matrix=contextVectorList, rows=" ", columns=" ")
    outSpace.save(outPathVectors)

    logging.critical("W2V end")
    logging.critical("--- %s seconds ---" % (time.time() - start_time))
    print("")
Example #11
def main():
    """
    Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    """

    # Get the arguments
    args = docopt('''Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD and save it in pickle format.

    Usage:
        svd.py [-l] <matrixPath> <outPath> <dim> <gamma>

        <matrixPath> = path to matrix
        <outPath> = output path for space
        <dim> = dimensionality of low-dimensional output vectors
        <gamma> = eigenvalue weighting parameter

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    is_len = args['--len']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']
    dim = int(args['<dim>'])
    gamma = float(args['<gamma>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()    

    # Load input matrix
    space = Space(matrixPath)   
    matrix = space.matrix
    
    # Get mappings between rows/columns and words
    rows = space.rows
    id2row = space.id2row
    id2column = space.id2column

    # Apply SVD
    u, s, v = randomized_svd(matrix, n_components=dim, n_iter=5, transpose=False)

    # Weight matrix
    if gamma == 0.0:
        matrix_reduced = u
    elif gamma == 1.0:
        #matrix_reduced = np.dot(u, np.diag(s)) # This is equivalent to the below formula (because s is a flattened diagonal matrix)
        matrix_reduced = s * u
    else:
        #matrix_ = np.dot(u, np.power(np.diag(s), gamma)) # This is equivalent to the below formula
        matrix_reduced = np.power(s, gamma) * u
       
    outSpace = Space(matrix=matrix_reduced, rows=rows, columns=[])

    if is_len:
        # L2-normalize vectors
        outSpace.l2_normalize()
        
    # Save the matrix
    outSpace.save(outPath, format='w2v')

    logging.info("--- %s seconds ---" % (time.time() - start_time))                   
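
The broadcasting in the weighting step stands in for an explicit diagonal product, as the inline comments say; a quick numeric check:

import numpy as np

u = np.random.rand(5, 3)
s = np.array([3.0, 2.0, 1.0])
assert np.allclose(s * u, u @ np.diag(s))            # gamma = 1.0
assert np.allclose(s**0.5 * u, u @ np.diag(s)**0.5)  # fractional gamma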
Example #12
def main():

    # Get the arguments
    args = docopt("""

    Usage:
        Bert.py  <pathTestSentences> <outPathVectors> <vecType> 
        Bert.py  <pathTestSentences> <vecType>
        
    Arguments:
       
        <pathTestSentences> = Path to the test sentences
        <outPathVectors> = Path for storing the vectors
        <vecType> = "token" or "lemma"

    """)

    pathTestSentences = args['<pathTestSentences>']
    outPathVectors = args['<outPathVectors>']
    vecType = args['<vecType>']

    if len(sys.argv) == 3:
        outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz"

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.CRITICAL)
    print("")
    start_time = time.time()
    logging.critical("Bert start")

    # Load pre-trained model tokenizer (vocabulary)
    global tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Load pre-trained model (weights)
    global model
    model = BertModel.from_pretrained('bert-base-uncased',
                                      output_hidden_states=True)

    contextVectorList = []
    testSentences = []
    with open(pathTestSentences, 'r') as csvFile:
        reader = csv.DictReader(csvFile, delimiter="\t")
        for row in reader:
            testSentences.append(dict(row))

        #Token vs. Lemma
        if vecType == "token":
            vecTypeString = "sentence_token"
        else:
            vecTypeString = "sentence"

        #Create the vectors
        logging.critical("Create Bert embeddings")
        for i in range(0, len(testSentences)):
            #Create target word(s)
            targetWord = str(testSentences[i][vecTypeString].split()[int(
                [testSentences[i]["target_index"]][0])])
            targetWords = []
            targetWords.append(tokenizer.tokenize(targetWord))
            targetWords = targetWords[0]

            #Tokenize text
            text = testSentences[i][vecTypeString]
            marked_text = "[CLS] " + text + " [SEP]"
            tokenized_text = tokenizer.tokenize(marked_text)

            #Search the indices of the tokenized target word in the tokenized text
            targetWordIndices = []

            for j in range(0, len(tokenized_text)):  # j, so the sentence index i is not shadowed
                if tokenized_text[j] == targetWords[0]:
                    for l in range(0, len(targetWords)):
                        if tokenized_text[j + l] == targetWords[l]:
                            targetWordIndices.append(j + l)
                        if len(targetWordIndices) == len(targetWords):
                            break

            #Create BERT Token Embeddings
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            segments_ids = [1] * len(tokenized_text)
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])
            model.eval()
            with torch.no_grad():
                outputs = model(tokens_tensor, segments_tensors)
                hidden_states = outputs[2]
            token_embeddings = torch.stack(hidden_states, dim=0)
            token_embeddings = torch.squeeze(token_embeddings, dim=1)
            token_embeddings = token_embeddings.permute(1, 0, 2)
            vectors = []
            for number in targetWordIndices:
                token = token_embeddings[number]
                sum_vec = np.sum([np.array(token[12]),
                                  np.array(token[1])],
                                 axis=0)
                vectors.append(np.array(sum_vec))
            contextVectorList.append(np.sum(vectors, axis=0, dtype=float))

    #Normalize vectors in length
    contextVectorList = preprocessing.normalize(contextVectorList, norm='l2')

    #Save contextVectorList_sparse matrix
    outSpace = Space(matrix=contextVectorList, rows=" ", columns=" ")
    outSpace.save(outPathVectors)

    logging.critical("Bert end")
    logging.critical("--- %s seconds ---" % (time.time() - start_time))
    print("")
Example #13
def main():
    """
    Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices as described in:
       Pierpaolo Basile, Annalina Caputo and Giovanni Semeraro, 2014. Analysing Word Meaning over Time by Exploiting Temporal Random Indexing.
    """

    # Get the arguments
    args = docopt(
        '''Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices.

    Usage:
        srv_align.py [-l] <matrixPath1> <matrixPath2> <outPath1> <outPath2> <dim>

        <matrixPath1> = path to matrix1
        <matrixPath2> = path to matrix2
        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <dim> = number of dimensions for random vectors

    Options:
        -l, --len   normalize final vectors to unit length

    Note:
        Assumes intersected and ordered columns. Parameters -s, -a and <t> have been removed from an earlier version for efficiency. Also columns are now intersected instead of unified.
  
    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')

    is_len = args['--len']
    matrixPath1 = args['<matrixPath1>']
    matrixPath2 = args['<matrixPath2>']
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']
    dim = int(args['<dim>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load input matrices
    countSpace1 = Space(matrixPath1)
    countMatrix1 = countSpace1.matrix
    rows1 = countSpace1.rows
    columns1 = countSpace1.columns

    countSpace2 = Space(matrixPath2)
    countMatrix2 = countSpace2.matrix
    rows2 = countSpace2.rows
    columns2 = countSpace2.columns

    # Generate random vectors
    randomMatrix = csr_matrix(
        sparse_random_matrix(dim, len(columns1)).toarray().T)

    logging.info("Multiplying matrices")
    reducedMatrix1 = np.dot(countMatrix1, randomMatrix)
    reducedMatrix2 = np.dot(countMatrix2, randomMatrix)

    outSpace1 = Space(matrix=reducedMatrix1, rows=rows1, columns=[])
    outSpace2 = Space(matrix=reducedMatrix2, rows=rows2, columns=[])

    if is_len:
        # L2-normalize vectors
        outSpace1.l2_normalize()
        outSpace2.l2_normalize()

    # Save the matrices
    outSpace1.save(outPath1)
    outSpace2.save(outPath2)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example #14
def main():
    """
    Make count-based vector space from corpus.
    """

    # Get the arguments
    args = docopt("""Make count-based vector space from corpus.

    Usage:
        count.py <corpDir> <vocabFile> <outPath> <windowSize>
               
        <corpDir> = path to corpus or corpus directory (iterates through files)
        <vocabFile> = row and column vocabulary
        <outPath> = output path for vectors
        <windowSize> = the linear distance of context words to consider in each direction
        
    Note:
        Skips one-word sentences to avoid zero-vectors. Does not increase window size when out-of-vocabulary words are found.

    """)

    corpDir = args['<corpDir>']
    vocabFile = args['<vocabFile>']
    outPath = args['<outPath>']
    windowSize = int(args['<windowSize>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load vocabulary
    logging.info("Loading vocabulary")
    with open(vocabFile, 'r', encoding='utf-8') as f_in:
        vocabulary = [line.strip() for line in f_in]

    w2i = {w: i for i, w in enumerate(vocabulary)}

    # Initialize co-occurrence matrix as dictionary
    cooc_mat = defaultdict(lambda: 0)

    # Get counts from corpus
    logging.info("Counting context words")
    sentences = PathLineSentences(corpDir)
    for sentence in sentences:
        for i, word in enumerate(sentence):
            try:
                windex = w2i[word]
            except KeyError:
                continue
            lowerWindowSize = max(i - windowSize, 0)
            upperWindowSize = min(i + windowSize, len(sentence))
            window = sentence[lowerWindowSize:i] + sentence[i +
                                                            1:upperWindowSize +
                                                            1]
            if len(window) == 0:  # Skip one-word sentences
                continue
            for contextWord in window:
                try:
                    cindex = w2i[contextWord]
                except KeyError:
                    continue
                cooc_mat[(windex, cindex)] += 1

    # Convert dictionary to sparse matrix
    logging.info("Converting dictionary to matrix")
    cooc_mat_sparse = dok_matrix((len(vocabulary), len(vocabulary)),
                                 dtype=float)
    try:
        cooc_mat_sparse.update(cooc_mat)
    except NotImplementedError:
        cooc_mat_sparse._update(cooc_mat)

    outSpace = Space(matrix=cooc_mat_sparse,
                     rows=vocabulary,
                     columns=vocabulary)

    # Save the matrix
    outSpace.save(outPath)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example #15
def main():

    # Get the arguments
    args = docopt("""

    Usage:
        CountBasedVectors.py  <pathMatrix> <pathw2i> <pathCorpus> <pathTestSentences> <outPathVectors> <sentenceType> <windowSize2> 
        CountBasedVectors.py  <pathCorpus> <pathTestSentences> <sentenceType> <windowSize2>
        
    Arguments:
       
        <pathMatrix> = Path to the word vector matrix
        <pathw2i> = Path to the word-to-index
        <pathCorpus> = path to the corpus 
        <pathTestSentences> = Path to the test sentences
        <outPathVectors> = Path for storing the vectors
        <sentenceType> = "lemma" or "token"
        <windowSize2> = Window size (20 works fine)
        
        
    """)

    pathMatrix = args['<pathMatrix>']
    pathTestSentences = args['<pathTestSentences>']
    pathw2i = args['<pathw2i>']
    outPathVectors = args['<outPathVectors>']
    windowSize2 = int(args['<windowSize2>'])
    pathCorpus = args['<pathCorpus>']
    sentenceType = args['<sentenceType>']

    if len(sys.argv) == 5:
        pathMatrix = "Files/Vectors/FirstOrder/matrix.npz"
        pathw2i = "Files/Vectors/FirstOrder/w2i.npz.npy"
        outPathVectors = "Files/Vectors/SecondOrder/Vectors.npz"

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.CRITICAL)
    print("")
    start_time = time.time()
    logging.critical("ContextVectors start")

    #Load w2i
    w2i = np.load(pathw2i, allow_pickle='TRUE').item()

    if sentenceType == "token":
        sentType = "sentence_token"
    else:
        sentType = "sentence"

    #Load saved wordVectorMatrix
    try:
        inSpace = Space(path=pathMatrix, format='w2v')
    except UnicodeDecodeError:
        inSpace = Space(path=pathMatrix)

    cooc_mat_sparse = inSpace.matrix

    #Calculate IDF for every word
    docFreq = {}

    for i in range(0, len(w2i)):
        docFreq[i] = 0
    with gzip.open(pathCorpus, 'rt', encoding="utf-8") as sentences:
        count = 0
        try:
            for sentence in sentences:
                count = count + 1
                for word in set(sentence.split()):
                    if word in w2i:
                        docFreq[w2i[word]] += 1
        except Exception:  # tolerate unreadable lines in the corpus
            pass
        for key, value in w2i.items():
            docFreq[value] = math.log10(count / max(docFreq[value], 1))

    #Load TestSentences
    contextVectorList = []
    testSentences = []
    with open(pathTestSentences, 'r') as csvFile:
        reader = csv.DictReader(csvFile, delimiter="\t")
        for row in reader:
            testSentences.append(dict(row))

    #Calculate contextVectorMatrix
    logging.critical("Calculate contextVectorMatrix")
    nonExisting = False
    target = str(testSentences[0]["original_word"])
    for dic in testSentences:
        sentence = dic[sentType].split()
        for i, word in enumerate(sentence):
            if str(i) == dic['target_index'] and word == target:
                toMelt = []
                toMeltIDF = []
                lowerWindowSize = max(i - windowSize2, 0)
                upperWindowSize = min(i + windowSize2, len(sentence))
                window = sentence[lowerWindowSize:i] + sentence[
                    i + 1:upperWindowSize + 1]
                if word in w2i:
                    windex = w2i[word]
                    for contextWord in window:
                        if contextWord != "$":
                            if contextWord in w2i:
                                contextWordIndex = w2i[contextWord]
                                toMelt.append(
                                    cooc_mat_sparse[contextWordIndex].toarray(
                                    )[0] *
                                    math.pow(docFreq[contextWordIndex], 1))
                    contextVectorList.append(getContextVector(toMelt))
                else:
                    nonExisting = True

    #Normalize vectors in length
    contextVectorList = preprocessing.normalize(contextVectorList, norm='l2')

    #Save contextVectorList_sparse matrix
    outSpace = Space(matrix=contextVectorList, rows=" ", columns=" ")
    outSpace.save(outPathVectors)

    logging.critical("ContextVectors end")
    logging.critical("--- %s seconds ---" % (time.time() - start_time))
    print("")
Example #16
def main():
    """
    Create low-dimensional vector space by sparse random indexing from co-occurrence matrix.
    """

    # Get the arguments
    args = docopt(
        '''Create low-dimensional vector space by sparse random indexing from co-occurrence matrix.

    Usage:
        ri.py [-l] <matrixPath> <outPath> <dim>

        <matrixPath> = path to matrix
        <outPath> = output path for reduced space 
        <dim> = number of dimensions for random vectors

    Options:
        -l, --len   normalize final vectors to unit length

    Note:
        Parameters -s, -a and <t> have been removed from an earlier version for efficiency.

    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')

    is_len = args['--len']
    matrixPath = args['<matrixPath>']
    outPath = args['<outPath>']
    dim = int(args['<dim>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load input matrix
    countSpace = Space(matrixPath)
    countMatrix = countSpace.matrix
    rows = countSpace.rows
    columns = countSpace.columns

    # Generate random vectors
    randomMatrix = csr_matrix(
        sparse_random_matrix(dim, len(columns)).toarray().T)

    logging.info("Multiplying matrices")
    reducedMatrix = np.dot(countMatrix, randomMatrix)
    outSpace = Space(matrix=reducedMatrix, rows=rows, columns=[])

    if is_len:
        # L2-normalize vectors
        outSpace.l2_normalize()

    # Save the matrix
    outSpace.save(outPath, format='w2v')

    logging.info("--- %s seconds ---" % (time.time() - start_time))