Beispiel #1
0
def createModels():
    """Create a ``Model(5, 256)`` for every candidate and feed it its texts.

    The training index is loaded through ``jsonhandler`` first. Each new
    model is stored in the ``models`` mapping under the candidate's name and
    then reads every training document of that candidate. Progress is
    printed to stdout.
    """
    jsonhandler.loadTraining()
    for cand in candidates:
        model = Model(5, 256)
        models[cand] = model
        print("creating model for " + cand)
        for doc in jsonhandler.trainings[cand]:
            training_text = jsonhandler.getTrainingText(cand, doc)
            model.read(training_text)
            print(doc + " read")
Beispiel #2
0
def createModels():
    """Create, train and persist a ``Model(5, 256)`` for every candidate.

    The training index is loaded through ``jsonhandler`` first. Each new
    model is stored in the ``models`` mapping, reads every training document
    of its candidate, and is then written with ``storeModel`` to
    ``<modeldir>/<candidate>``. Progress is printed to stdout.
    """
    jsonhandler.loadTraining()
    for cand in candidates:
        model = Model(5, 256)
        models[cand] = model
        print("creating model for " + cand)
        for doc in jsonhandler.trainings[cand]:
            training_text = jsonhandler.getTrainingText(cand, doc)
            model.read(training_text)
            print(doc + " read")
        target_path = os.path.join(modeldir, cand)
        storeModel(model, target_path)
        print("Model for " + cand + " saved")
Beispiel #3
0
def _known_texts(author, excluded_document):
    """Return the training texts of *author*, leaving out *excluded_document*."""
    return [
        jsonhandler.getTrainingText(author, document)
        for document in jsonhandler.trainings[author]
        if document != excluded_document
    ]


def load_attribution_data(corpus_name):
    """Return the verification samples for *corpus_name*, cached on disk.

    Every sample is a list ``[known_documents, unknown_text, same_author]``:

    * ``known_documents`` -- training texts of one author, without the
      document whose file name equals the "unknown" one,
    * ``unknown_text`` -- the text treated as being of unknown authorship,
    * ``same_author`` -- ``True`` when the unknown text was written by that
      author, ``False`` when it belongs to another candidate.

    For each author the negative samples (texts of every other candidate)
    come first, followed by the positive samples (each own text held out).

    The result is pickled to ``corpora_texts/<corpus_name>``. The original
    code tested a different path (``corpora_texts/<data dir>/<corpus_name>``)
    than the one it read from and wrote to, so the cache was never used
    reliably; the same path is now used for the check, the read and the
    write.

    Keyword arguments:
    corpus_name -- name of the corpus below ``attribution_dataset_data_dir``
    """
    cache_path = os.path.join('corpora_texts', corpus_name)

    if os.path.exists(cache_path):
        # NOTE(review): pickle.load can execute arbitrary code. This is only
        # acceptable while 'corpora_texts' is written exclusively by this
        # program -- never point it at files from an untrusted source.
        with open(cache_path, 'rb') as pickle_file:
            return pickle.load(pickle_file)

    dataset = attribution_dataset_data_dir + '/' + corpus_name
    jsonhandler.loadJson(dataset)
    jsonhandler.loadTraining()
    # Read the candidate list only after loading, so it is valid no matter
    # whether jsonhandler fills its lists in place or rebinds them.
    candidates = jsonhandler.candidates

    corpus = []
    for author in candidates:
        # Negative samples: texts written by any other candidate.
        for other_author in candidates:
            if author == other_author:
                continue
            for unknown_text in jsonhandler.trainings[other_author]:
                known_documents = _known_texts(author, unknown_text)
                corpus.append([
                    known_documents,
                    jsonhandler.getTrainingText(other_author, unknown_text),
                    False,
                ])
        # Positive samples: hold out each of the author's own texts in turn.
        for unknown in jsonhandler.trainings[author]:
            known_documents = _known_texts(author, unknown)
            corpus.append([
                known_documents,
                jsonhandler.getTrainingText(author, unknown),
                True,
            ])

    # Create the cache directory (including any sub-directory contained in
    # corpus_name) right before writing.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    # Another run of the program could have written the corpus meanwhile.
    if not os.path.exists(cache_path):
        with open(cache_path, 'wb') as pickle_file:
            pickle.dump(corpus, pickle_file,
                        protocol=pickle.HIGHEST_PROTOCOL)
    return corpus
Beispiel #4
0
def tira(corpusdir, outputdir):
    """Attribute every unknown text to the candidate with the smallest KL value.

    For each candidate the longest training text is turned into a stopword
    graph; each unknown text gets a graph of its own, which is deep-copied
    and normalized before being compared with ``authorKLdiv`` against every
    candidate graph. The candidate with the minimal value is stored as the
    answer via ``jsonhandler.storeJson``.

    The stopword list is read from ``stopwords.txt`` in the current working
    directory. All whitespace-separated words of the file are used; the
    previous code kept only the words of the *last* line, left the file
    object itself in ``stopwords`` for an empty file, and never closed the
    file.

    Keyword arguments:
    corpusdir -- Path to a tira corpus
    outputdir -- Output directory

    Raises ValueError if a candidate has no training texts.
    """
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    with open("stopwords.txt") as stopword_file:
        logger.info("Reads in stopwords")
        stopwords = stopword_file.read().split()

    authors = jsonhandler.candidates
    tests = jsonhandler.unknowns

    Gauthors = {}
    for author in authors:
        logger.info("Reads in text %s...", author)
        # Only the longest training text represents the author (the first
        # one wins on ties, as before).
        longest_text = max(
            (jsonhandler.getTrainingText(author, training)
             for training in jsonhandler.trainings[author]),
            key=len)
        author_corpus = prepareCorpus(longest_text)
        logger.info("Calculates Stopword Graph of %s...", author)
        Gauthors[author] = getStopWordGraph(author_corpus, stopwords)

    Gtests = {}
    for test in tests:
        logger.info("Reads in test document %s...", test)
        test_corpus = prepareCorpus(jsonhandler.getUnknownText(test))
        logger.info("Calculates Stopword Graph %s...", test)
        Gtests[test] = getStopWordGraph(test_corpus, stopwords)

    texts = []
    cands = []
    for testcase in tests:
        print(testcase)
        # Work on a copy so the stored test graph is not handed to
        # normalizeWeights directly.
        Gtst = normalizeWeights(deepcopy(Gtests[testcase]))
        KL = {}
        for author in authors:
            logger.info("Calculates KL Divergence of %s...", author)
            KL[author] = authorKLdiv(Gauthors[author], Gtst)
        print(KL)
        texts.append(testcase)
        cands.append(min(KL, key=KL.get))

    jsonhandler.storeJson(outputdir, texts, cands)
Beispiel #5
0
def tira(corpusdir, outputdir):
    """Attribute every unknown text to the candidate with the smallest KL value.

    For each candidate the longest training text is turned into a stopword
    graph; each unknown text gets a graph of its own, which is deep-copied
    and normalized before being compared with ``authorKLdiv`` against every
    candidate graph. The candidate with the minimal value is stored as the
    answer via ``jsonhandler.storeJson``.

    The stopword list is read from ``stopwords.txt`` in the current working
    directory. All whitespace-separated words of the file are used; the
    previous code kept only the words of the *last* line, left the file
    object itself in ``stopwords`` for an empty file, and never closed the
    file.

    Keyword arguments:
    corpusdir -- Path to a tira corpus
    outputdir -- Output directory

    Raises ValueError if a candidate has no training texts.
    """
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    with open("stopwords.txt") as stopword_file:
        logger.info("Reads in stopwords")
        stopwords = stopword_file.read().split()

    authors = jsonhandler.candidates
    tests = jsonhandler.unknowns

    Gauthors = {}
    for author in authors:
        logger.info("Reads in text %s...", author)
        # Only the longest training text represents the author (the first
        # one wins on ties, as before).
        longest_text = max(
            (jsonhandler.getTrainingText(author, training)
             for training in jsonhandler.trainings[author]),
            key=len)
        author_corpus = prepareCorpus(longest_text)
        logger.info("Calculates Stopword Graph of %s...", author)
        Gauthors[author] = getStopWordGraph(author_corpus, stopwords)

    Gtests = {}
    for test in tests:
        logger.info("Reads in test document %s...", test)
        test_corpus = prepareCorpus(jsonhandler.getUnknownText(test))
        logger.info("Calculates Stopword Graph %s...", test)
        Gtests[test] = getStopWordGraph(test_corpus, stopwords)

    texts = []
    cands = []
    for testcase in tests:
        print(testcase)
        # Work on a copy so the stored test graph is not handed to
        # normalizeWeights directly.
        Gtst = normalizeWeights(deepcopy(Gtests[testcase]))
        KL = {}
        for author in authors:
            logger.info("Calculates KL Divergence of %s...", author)
            KL[author] = authorKLdiv(Gauthors[author], Gtst)
        print(KL)
        texts.append(testcase)
        cands.append(min(KL, key=KL.get))

    jsonhandler.storeJson(outputdir, texts, cands)
Beispiel #6
0
def fit_parameters():
    """Evaluate ``create_ranking`` for every combination of ``n`` and ``L``.

    Training data and ground truth are loaded through ``jsonhandler``
    first. Each combination is ranked with ``create_ranking(n, L)`` and
    scored with ``evalTesting`` against ``jsonhandler.unknowns``.

    Returns a list of ``(accuracy, n, L)`` tuples in evaluation order
    (``n`` is the outer loop, ``L`` the inner one).
    """
    jsonhandler.loadTraining()
    jsonhandler.loadGroundTruth()

    # Alternative, smaller ranges left by the author:
    # n in [2, 3], L in [20, 50, 100]
    combinations = [(n, L)
                    for n in (3, 4, 5, 6)
                    for L in (500, 1000, 2000, 3000, 5000)]

    results = []
    for n, L in combinations:
        logging.info("Test parameters: n=%d, l=%d", n, L)
        authors, _scores = create_ranking(n, L)
        accuracy = evalTesting(jsonhandler.unknowns, authors)["accuracy"]
        results.append((accuracy, n, L))
    return results
Beispiel #7
0
def fit_parameters():
    """Evaluate ``create_ranking`` for every combination of ``n`` and ``L``.

    Training data and ground truth are loaded through ``jsonhandler``
    first. Each combination is ranked with ``create_ranking(n, L)`` and
    scored with ``evalTesting`` against ``jsonhandler.unknowns``.

    Returns a list of ``(accuracy, n, L)`` tuples in evaluation order
    (``n`` is the outer loop, ``L`` the inner one).
    """
    jsonhandler.loadTraining()
    jsonhandler.loadGroundTruth()

    # Alternative, smaller ranges left by the author:
    # n in [2, 3], L in [20, 50, 100]
    combinations = [(n, L)
                    for n in (3, 4, 5, 6)
                    for L in (500, 1000, 2000, 3000, 5000)]

    results = []
    for n, L in combinations:
        logging.info("Test parameters: n=%d, l=%d", n, L)
        authors, _scores = create_ranking(n, L)
        accuracy = evalTesting(jsonhandler.unknowns, authors)["accuracy"]
        results.append((accuracy, n, L))
    return results
Beispiel #8
0
def main():
    """Command-line entry point.

    Reads the input directory (``-i``) and the output directory (``-o``)
    from the command line, loads the corpus and its training data through
    ``jsonhandler`` and hands both paths to ``test_method``.
    """
    description = ('Tira submission for "Author identification using '
                   'imbalanced and limited training texts."')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-i', action='store', help='Path to input directory')
    parser.add_argument('-o', action='store', help='Path to output directory')
    options = parser.parse_args()

    corpusdir, outputdir = options.i, options.o

    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    test_method(corpusdir, outputdir)
Beispiel #9
0
def main(corpusdir,
         outputdir,
         n_max_feature_number=1000,
         m_subspace_width=2,
         mySolver='svd',
         shrinkage=None):
    """Train the subspace classifiers on a corpus and label its unknown texts.

    All training texts are collected together with the index of their
    author, wrapped by ``getBunchOutRawTrain`` (the result is kept in the
    global ``trainSet``) and passed to ``exhaustiv_disjoint_subspacing``.
    ``mp`` then produces one matrix row per unknown text; the column with
    the largest value (``argmax`` along axis 1) selects the author, and that
    matrix entry is stored as the accompanying score.

    Keyword arguments:
    corpusdir -- Path to the corpus
    outputdir -- Output directory
    n_max_feature_number -- forwarded to the subspacing step and to ``mp``
    m_subspace_width -- forwarded to the subspacing step and to ``mp``
    mySolver -- forwarded to ``exhaustiv_disjoint_subspacing``
    shrinkage -- forwarded to ``exhaustiv_disjoint_subspacing``
    """
    global trainSet, testSet

    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    authors = jsonhandler.candidates
    tests = jsonhandler.unknowns
    encoding_setting = jsonhandler.encoding

    # Training texts and, in parallel, the index of the author of each one.
    texts = []
    text_authors = []
    for author_index, author in enumerate(authors):
        for text in jsonhandler.trainings[author]:
            texts.append(jsonhandler.getTrainingText(author, text))
            text_authors.append(author_index)

    trainSet = getBunchOutRawTrain(texts, text_authors, authors)
    classifier_set = exhaustiv_disjoint_subspacing(
        trainSet, n_max_feature_number, m_subspace_width, encoding_setting,
        mySolver, shrinkage)

    # The classifiers are run on every test text.
    test_texts = [jsonhandler.getUnknownText(t_text) for t_text in tests]

    proMatrx = mp(n_max_feature_number, m_subspace_width, classifier_set,
                  test_texts)
    ergebnis = argmax(proMatrx, 1)

    result_author_list = []
    prob_list = []
    for row, winner in enumerate(ergebnis):
        result_author_list.append(authors[winner])
        prob_list.append(proMatrx[row][winner])

    jsonhandler.storeJson(outputdir, tests, result_author_list, prob_list)
def main():
    """Command-line entry point.

    Reads the input directory (``-i``) and the output directory (``-o``)
    from the command line, loads the corpus and its training data through
    ``jsonhandler`` and hands both paths to ``test_method``.
    """
    description = ('Tira submission for "Author identification using '
                   'imbalanced and limited training texts."')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-i', action='store', help='Path to input directory')
    parser.add_argument('-o', action='store', help='Path to output directory')
    options = parser.parse_args()

    corpusdir, outputdir = options.i, options.o

    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    test_method(corpusdir, outputdir)
def tira(corpusdir, outputdir):
    """Attribute every unknown text with ``create_author_ranking``.

    The first entry of the ranking computed with ``method=dict_entropy``
    names the chosen author. Every answer is stored with the constant
    score 0.5.

    Keyword arguments:
    corpusdir -- Path to a tira corpus
    outputdir -- Output directory
    """
    # Both names are bound before loading, exactly as the original did.
    candidates = jsonhandler.candidates
    unknowns = jsonhandler.unknowns
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    authors = []
    scores = []
    for filename in unknowns:
        # Alternative left by the author:
        # create_simple_ranking(candidates, filename,
        #                       method=relative_zlib_entropy)
        ranking = create_author_ranking(candidates, filename,
                                        method=dict_entropy)
        best_entry = ranking[0]
        authors.append(best_entry[0])
        scores.append(0.5)

    jsonhandler.storeJson(outputdir, unknowns, authors, scores)
Beispiel #12
0
def tira(corpusdir, outputdir):
    """Attribute every unknown text with ``create_author_ranking``.

    The first entry of the ranking computed with ``method=dict_entropy``
    names the chosen author. Every answer is stored with the constant
    score 0.5.

    Keyword arguments:
    corpusdir -- Path to a tira corpus
    outputdir -- Output directory
    """
    # Both names are bound before loading, exactly as the original did.
    candidates = jsonhandler.candidates
    unknowns = jsonhandler.unknowns
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    authors = []
    scores = []
    for filename in unknowns:
        # Alternative left by the author:
        # create_simple_ranking(candidates, filename,
        #                       method=relative_zlib_entropy)
        ranking = create_author_ranking(candidates, filename,
                                        method=dict_entropy)
        best_entry = ranking[0]
        authors.append(best_entry[0])
        scores.append(0.5)

    jsonhandler.storeJson(outputdir, unknowns, authors, scores)
Beispiel #13
0
def tira(corpusdir, outputdir):
    """Attribute unknown texts to the author with the smallest delta value.

    Steps:
    1. Build a ``Database`` from all training texts of all candidates.
    2. For 150, 200, 250 and 300 considered words, count how many training
       texts are attributed to their real author; keep the best length
       (the smallest one on ties).
    3. Reprocess the database with that length and attribute every unknown
       text to the author whose ``calc_delta`` value is minimal.

    Keyword arguments:
    corpusdir -- Path to a tira corpus
    outputdir -- Output directory
    """
    use_pos_tags = False

    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    # creating training data
    logging.info("Load the training data...")
    database = Database(150, real_words=True)
    for candidate in jsonhandler.candidates:
        author = Author(candidate)
        for training in jsonhandler.trainings[candidate]:
            logging.info("Author '%s': Loading training '%s'", candidate,
                         training)
            raw_text = jsonhandler.getTrainingText(candidate, training)
            author.add_text(
                Text(raw_text, candidate + " " + training,
                     pos_tag=use_pos_tags))
        database.add_author(author)
    database.process()

    for author in database.authors:
        author.calc_cmsz(database)

    def nearest_author(case):
        """Return the database author with the minimal delta to *case*."""
        case.calc_zscores(database)
        deltas = {}
        for known_author in database.authors:
            deltas[known_author] = case.calc_delta(database, known_author)
        return min(deltas, key=deltas.get)

    # do some training, i.e. check which parameter is best
    logging.info("Start training...")
    hits_by_length = {}
    for length in range(150, 301, 50):
        logging.info("Check for %s words.", length)
        database.considered_words = length
        database.process()
        hits_by_length[length] = 0
        for correct_author in database.authors:
            for training in jsonhandler.trainings[correct_author.name]:
                trainingcase = Text(
                    jsonhandler.getTrainingText(correct_author.name, training),
                    "Trainingcase",
                    pos_tag=use_pos_tags)
                if nearest_author(trainingcase) == correct_author:
                    hits_by_length[length] += 1
    length = max(hits_by_length, key=hits_by_length.get)
    logging.info("Choose %s as length.", str(length))

    # reconfigure the database with length
    database.considered_words = length
    database.process()

    # run the testcases
    texts = []
    cands = []
    for unknown in jsonhandler.unknowns:
        testcase = Text(jsonhandler.getUnknownText(unknown),
                        unknown,
                        pos_tag=use_pos_tags)
        texts.append(unknown)
        cands.append(nearest_author(testcase).name)

    jsonhandler.storeJson(outputdir, texts, cands)
Beispiel #14
0
def main():
    """Command-line entry point of the koppel11 submission.

    Reads the corpus directory (``-i``) and the output directory (``-o``);
    prints the help text and returns when either is missing.

    All training texts of a candidate are concatenated. Candidates with
    fewer than ``mintrainlen`` whitespace-separated words are dropped; the
    remaining texts form the corpus passed to ``training``. Each unknown
    text with at least ``minlen`` words is compared ``repetitions`` times
    against a random string of every candidate, each time with a random
    half of the features; the candidate that wins most often is the answer
    if its share of wins reaches ``threshold``. In every other case the
    answer is the string "None".

    Changes compared with the previous version: ``is None`` instead of
    ``== None``, texts are assembled with ``str.join`` instead of repeated
    ``+=``, the candidate list is filtered in a single pass, and the case
    that no candidate has enough training text is answered with "None"
    instead of crashing in ``min()`` on an empty list.
    """
    parser = argparse.ArgumentParser(
        description="Tira submission for PPM approach (koppel11)")
    parser.add_argument("-i", action="store", help="path to corpus directory")
    parser.add_argument("-o", action="store", help="path to output directory")
    args = vars(parser.parse_args())

    corpusdir = args["i"]
    outputdir = args["o"]
    if corpusdir is None or outputdir is None:
        parser.print_help()
        return

    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()
    # Read the lists after loading so they are valid no matter whether
    # jsonhandler fills them in place or rebinds them.
    unknowns = jsonhandler.unknowns

    print("loading texts for training")
    texts = {}
    candidates = []
    corpus_parts = []
    for cand in jsonhandler.candidates:
        parts = []
        for filename in jsonhandler.trainings[cand]:
            parts.append(jsonhandler.getTrainingText(cand, filename))
            print("text " + filename + " read")
        cand_text = "".join(parts)
        # Candidates with too little training material are left out.
        if len(cand_text.split()) < mintrainlen:
            continue
        texts[cand] = cand_text
        candidates.append(cand)
        corpus_parts.append(cand_text)
    corpus = "".join(corpus_parts)

    if not candidates:
        # Nothing to compare against: answer like for a too-short unknown
        # text instead of failing in min() on an empty sequence.
        print("no candidate has enough training text")
        print("storing answers")
        jsonhandler.storeJson(outputdir, unknowns,
                              ["None"] * len(unknowns),
                              [0] * len(unknowns))
        return

    minwords = min(len(text.split()) for text in texts.values())
    print(minwords)

    fl = training(corpus)
    authors = []
    scores = []

    for filename in unknowns:
        print("testing " + filename)
        uwords = jsonhandler.getUnknownText(filename).split()
        ulen = len(uwords)
        if ulen < minlen:
            authors.append("None")
            scores.append(0)
            continue

        wins = [0] * len(candidates)
        textlen = min(ulen, minwords)
        print(textlen)
        # The words are joined without a separator, as before.
        ustring = "".join(uwords[:textlen])
        for _ in range(repetitions):
            rfl = random.sample(fl, len(fl) // 2)
            sims = [
                testSim(getRandomString(texts[cand], textlen), ustring,
                        rfl, 1)
                for cand in candidates
            ]
            wins[sims.index(max(sims))] += 1

        score = max(wins) / float(repetitions)
        if score >= threshold:
            authors.append(candidates[wins.index(max(wins))])
        else:
            authors.append("None")
        scores.append(score)

    print("storing answers")
    jsonhandler.storeJson(outputdir, unknowns, authors, scores)
Beispiel #15
0
def _unmasking_scores(unknown, unknown_text, known_text, initial_features):
    """Return the classifier accuracy curve for one pair of texts.

    In each of ``NUMBER_ITERATIONS`` rounds a ``LinearSVC`` is fitted to
    separate the selected chunks of both texts, its accuracy on these very
    chunks is recorded, and the ``NUMBER_ELIMINATE_FEATURES`` features with
    the highest and the lowest weights are removed.

    unknown -- name of the unknown text (used for logging only)
    unknown_text, known_text -- ``Text`` objects whose chunks were created
    initial_features -- feature list to start from (it is copied)
    """
    # Work on a copy; the initial feature set is reused for every pair.
    features = list(initial_features)

    # randomly select equally many chunks from each text
    select_chunks(unknown_text, known_text)
    chunks = unknown_text.selected_chunks + known_text.selected_chunks

    # label vector (0 -> chunks of unknown text, 1 -> chunks of known text).
    # It stays one-dimensional: the former ``label.reshape(...)`` call
    # discarded its result and therefore never changed the array.
    label = numpy.array([0] * len(unknown_text.selected_chunks) +
                        [1] * len(known_text.selected_chunks))

    scores = []
    for iteration in range(NUMBER_ITERATIONS):
        logging.info("Iteration #%s for texts '%s' and '%s'",
                     str(iteration + 1), unknown, known_text.name)
        # relative word counts of the remaining features in every chunk
        matrix = numpy.array([[chunk.count(word) / CHUNK_LENGTH
                               for word in features]
                              for chunk in chunks])

        # The training accuracy is one point of the curve.
        classifier = svm.LinearSVC()
        classifier.fit(matrix, label)
        scores.append(classifier.score(matrix, label))

        # Indices of the strongest weighted features on each side.
        order = numpy.argsort(classifier.coef_[0])
        delete = list(order[-NUMBER_ELIMINATE_FEATURES:]) \
            + list(order[:NUMBER_ELIMINATE_FEATURES])

        # Resolve the indices to features first: removing one by one would
        # shift the indices of the remaining entries.
        delete_features = [features[index] for index in delete]
        logging.info("Delete %s", str(delete_features))

        for feature in delete_features:
            # a single feature could appear twice in the delete list
            if feature in features:
                features.remove(feature)
    return scores


def tira(corpusdir, outputdir):
    """Attribute every unknown text with the unmasking procedure.

    Every unknown text is compared with every training text of every
    candidate. Each pair yields an accuracy curve which ``curve_score``
    reduces to one number; a candidate keeps the smallest number of its
    texts, and the candidate with the overall smallest number is the
    answer. If processing an unknown text raises an exception, the answer
    is the string "FILE_TO_SMALL" and the exception is logged.

    The process exits if the chunks of a training text cannot be created.

    Keyword arguments:
    corpusdir -- Path to a tira corpus
    outputdir -- Output directory
    """
    # load training data
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    database = Database()

    for candidate in jsonhandler.candidates:
        database.add_author(candidate)
        for training in jsonhandler.trainings[candidate]:
            logging.info(
                "Reading training text '%s' of '%s'", training, candidate)
            text = Text(jsonhandler.getTrainingText(candidate, training),
                        candidate + " " + training)
            try:
                text.create_chunks()
                database.add_text(candidate, text)
            except Exception:
                # ``Exception`` instead of a bare except, so that
                # KeyboardInterrupt/SystemExit are not mistaken for a
                # too-small text; the traceback shows the real cause.
                logging.warning("Text too small. Exit.", exc_info=True)
                sys.exit()

    database.calc_initial_feature_set()

    candidates = []  # this list shall contain the most likely candidates

    # We use the unmasking procedure to compare all unknown texts to all
    # enumerated texts of known authorship and then decide which fit best.
    # runtime could surely be optimized
    for unknown in jsonhandler.unknowns:
        try:
            # smallest curve score found so far for every candidate
            results = {}

            # load the unknown text and create the chunks which are used
            # for the unmasking process
            unknown_text = Text(jsonhandler.getUnknownText(unknown), unknown)
            unknown_text.create_chunks()

            for candidate in jsonhandler.candidates:
                results[candidate] = float("inf")

                for known_text in database.texts[candidate]:
                    scores = _unmasking_scores(unknown, unknown_text,
                                               known_text,
                                               database.initial_feature_set)
                    # The scores list is the curve used for the decision.
                    score = curve_score(scores)
                    logging.info("Calculated a score of %s", str(score))
                    if score < results[candidate]:
                        results[candidate] = score

            # The candidate with the smallest score is chosen.
            most_likely_author = min(results, key=results.get)
            logging.info("Most likely author is '%s' with a score of %s",
                         most_likely_author, results[most_likely_author])
            candidates.append(most_likely_author)
        except Exception:
            # Keep the best-effort answer, but no longer hide the reason
            # (and no longer swallow KeyboardInterrupt/SystemExit).
            logging.exception("Could not process unknown text '%s'", unknown)
            candidates.append("FILE_TO_SMALL")

    # save everything in the specified directory
    jsonhandler.storeJson(outputdir, jsonhandler.unknowns, candidates)
Beispiel #16
0
def tira(corpusdir, outputdir):
    """Attribute unknown texts to the author with the smallest delta value.

    Steps:
    1. Build a ``Database`` from all training texts of all candidates.
    2. For 150, 200, 250 and 300 considered words, count how many training
       texts are attributed to their real author; keep the best length
       (the smallest one on ties).
    3. Reprocess the database with that length and attribute every unknown
       text to the author whose ``calc_delta`` value is minimal.

    Keyword arguments:
    corpusdir -- Path to a tira corpus
    outputdir -- Output directory
    """
    use_pos_tags = False

    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    # creating training data
    logging.info("Load the training data...")
    database = Database(150, real_words=True)
    for candidate in jsonhandler.candidates:
        author = Author(candidate)
        for training in jsonhandler.trainings[candidate]:
            logging.info("Author '%s': Loading training '%s'", candidate,
                         training)
            raw_text = jsonhandler.getTrainingText(candidate, training)
            author.add_text(
                Text(raw_text, candidate + " " + training,
                     pos_tag=use_pos_tags))
        database.add_author(author)
    database.process()

    for author in database.authors:
        author.calc_cmsz(database)

    def nearest_author(case):
        """Return the database author with the minimal delta to *case*."""
        case.calc_zscores(database)
        deltas = {}
        for known_author in database.authors:
            deltas[known_author] = case.calc_delta(database, known_author)
        return min(deltas, key=deltas.get)

    # do some training, i.e. check which parameter is best
    logging.info("Start training...")
    hits_by_length = {}
    for length in range(150, 301, 50):
        logging.info("Check for %s words.", length)
        database.considered_words = length
        database.process()
        hits_by_length[length] = 0
        for correct_author in database.authors:
            for training in jsonhandler.trainings[correct_author.name]:
                trainingcase = Text(
                    jsonhandler.getTrainingText(correct_author.name, training),
                    "Trainingcase",
                    pos_tag=use_pos_tags)
                if nearest_author(trainingcase) == correct_author:
                    hits_by_length[length] += 1
    length = max(hits_by_length, key=hits_by_length.get)
    logging.info("Choose %s as length.", str(length))

    # reconfigure the database with length
    database.considered_words = length
    database.process()

    # run the testcases
    texts = []
    cands = []
    for unknown in jsonhandler.unknowns:
        testcase = Text(jsonhandler.getUnknownText(unknown),
                        unknown,
                        pos_tag=use_pos_tags)
        texts.append(unknown)
        cands.append(nearest_author(testcase).name)

    jsonhandler.storeJson(outputdir, texts, cands)
Beispiel #17
0
def _unmasking_scores(unknown, unknown_text, known_text, initial_features):
    """Return the classifier accuracy curve for one pair of texts.

    In each of ``NUMBER_ITERATIONS`` rounds a ``LinearSVC`` is fitted to
    separate the selected chunks of both texts, its accuracy on these very
    chunks is recorded, and the ``NUMBER_ELIMINATE_FEATURES`` features with
    the highest and the lowest weights are removed.

    unknown -- name of the unknown text (used for logging only)
    unknown_text, known_text -- ``Text`` objects whose chunks were created
    initial_features -- feature list to start from (it is copied)
    """
    # Work on a copy; the initial feature set is reused for every pair.
    features = list(initial_features)

    # randomly select equally many chunks from each text
    select_chunks(unknown_text, known_text)
    chunks = unknown_text.selected_chunks + known_text.selected_chunks

    # label vector (0 -> chunks of unknown text, 1 -> chunks of known text).
    # It stays one-dimensional: the former ``label.reshape(...)`` call
    # discarded its result and therefore never changed the array.
    label = numpy.array([0] * len(unknown_text.selected_chunks) +
                        [1] * len(known_text.selected_chunks))

    scores = []
    for iteration in range(NUMBER_ITERATIONS):
        logging.info("Iteration #%s for texts '%s' and '%s'",
                     str(iteration + 1), unknown, known_text.name)
        # relative word counts of the remaining features in every chunk
        matrix = numpy.array([[chunk.count(word) / CHUNK_LENGTH
                               for word in features]
                              for chunk in chunks])

        # The training accuracy is one point of the curve.
        classifier = svm.LinearSVC()
        classifier.fit(matrix, label)
        scores.append(classifier.score(matrix, label))

        # Indices of the strongest weighted features on each side.
        order = numpy.argsort(classifier.coef_[0])
        delete = list(order[-NUMBER_ELIMINATE_FEATURES:]) \
            + list(order[:NUMBER_ELIMINATE_FEATURES])

        # Resolve the indices to features first: removing one by one would
        # shift the indices of the remaining entries.
        delete_features = [features[index] for index in delete]
        logging.info("Delete %s", str(delete_features))

        for feature in delete_features:
            # a single feature could appear twice in the delete list
            if feature in features:
                features.remove(feature)
    return scores


def tira(corpusdir, outputdir):
    """Attribute every unknown text with the unmasking procedure.

    Every unknown text is compared with every training text of every
    candidate. Each pair yields an accuracy curve which ``curve_score``
    reduces to one number; a candidate keeps the smallest number of its
    texts, and the candidate with the overall smallest number is the
    answer. If processing an unknown text raises an exception, the answer
    is the string "FILE_TO_SMALL" and the exception is logged.

    The process exits if the chunks of a training text cannot be created.

    Keyword arguments:
    corpusdir -- Path to a tira corpus
    outputdir -- Output directory
    """
    # load training data
    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()

    database = Database()

    for candidate in jsonhandler.candidates:
        database.add_author(candidate)
        for training in jsonhandler.trainings[candidate]:
            logging.info("Reading training text '%s' of '%s'", training,
                         candidate)
            text = Text(jsonhandler.getTrainingText(candidate, training),
                        candidate + " " + training)
            try:
                text.create_chunks()
                database.add_text(candidate, text)
            except Exception:
                # ``Exception`` instead of a bare except, so that
                # KeyboardInterrupt/SystemExit are not mistaken for a
                # too-small text; the traceback shows the real cause.
                logging.warning("Text too small. Exit.", exc_info=True)
                sys.exit()

    database.calc_initial_feature_set()

    candidates = []  # this list shall contain the most likely candidates

    # We use the unmasking procedure to compare all unknown texts to all
    # enumerated texts of known authorship and then decide which fit best.
    # runtime could surely be optimized
    for unknown in jsonhandler.unknowns:
        try:
            # smallest curve score found so far for every candidate
            results = {}

            # load the unknown text and create the chunks which are used
            # for the unmasking process
            unknown_text = Text(jsonhandler.getUnknownText(unknown), unknown)
            unknown_text.create_chunks()

            for candidate in jsonhandler.candidates:
                results[candidate] = float("inf")

                for known_text in database.texts[candidate]:
                    scores = _unmasking_scores(unknown, unknown_text,
                                               known_text,
                                               database.initial_feature_set)
                    # The scores list is the curve used for the decision.
                    score = curve_score(scores)
                    logging.info("Calculated a score of %s", str(score))
                    if score < results[candidate]:
                        results[candidate] = score

            # The candidate with the smallest score is chosen.
            most_likely_author = min(results, key=results.get)
            logging.info("Most likely author is '%s' with a score of %s",
                         most_likely_author, results[most_likely_author])
            candidates.append(most_likely_author)
        except Exception:
            # Keep the best-effort answer, but no longer hide the reason
            # (and no longer swallow KeyboardInterrupt/SystemExit).
            logging.exception("Could not process unknown text '%s'", unknown)
            candidates.append("FILE_TO_SMALL")

    # save everything in the specified directory
    jsonhandler.storeJson(outputdir, jsonhandler.unknowns, candidates)
Beispiel #18
0
def heavy_lifter(corpusdir, outputdir):
    """Run the unmasking experiments for a corpus and store the raw curves.

    Every training text of every candidate is loaded into a ``Database``.
    Each unknown text is then compared against each known text: for one
    (unknown, known) pair the experiment is repeated ``NUMBER_EXPERIMENTS``
    times, and every repetition fits a linear SVM ``NUMBER_ITERATIONS``
    times on a shrinking feature set, dropping after each round the
    ``FOLD_ELIMINATION`` highest- and lowest-weighted features.

    Args:
        corpusdir: Corpus directory handed to ``jsonhandler.loadJson``.
        outputdir: Directory handed to ``jsonhandler.storeJson``.

    Returns:
        None.  ``results`` (candidate -> list of per-repetition accuracy
        curves) and ``dropped_features`` (candidate -> list of the feature
        batches removed in each round) are passed to
        ``jsonhandler.storeJson``.
    """
    # Per the original author's notes (not verifiable from this function):
    # loadJson reads meta-file.json and fills jsonhandler's corpusdir, upath,
    # candidates, unknowns, encoding and language; upath is the folder that
    # holds the unknown texts, and unknowns are their file names
    # (e.g. "unknown00011.txt").
    jsonhandler.loadJson(corpusdir)
    # Per the original notes, this fills jsonhandler.trainings, e.g.
    # {'candidate00010': ['known00001.txt', 'known00002.txt'], ...}.
    jsonhandler.loadTraining()

    database = Database()
    for candidate in jsonhandler.candidates:
        database.add_author(candidate)
        for training in jsonhandler.trainings[candidate]:
            # The second argument is the text's name, e.g.
            # "candidate00001 known00002.txt".
            text = Text(
                jsonhandler.getTrainingText(candidate, training),
                candidate + " " + training)
            try:
                text.chunks_generator()
                text.ngrams_generator()
                database.add_text(candidate, text)
            except Exception:
                # Best effort: a text that cannot be processed is skipped.
                # Catching Exception (not a bare except) keeps Ctrl-C and
                # SystemExit working.
                print('Something went wrong.')
    database.feature_generator()

    # Defined up front so that storeJson below still receives dictionaries
    # when there are no unknown texts at all.
    results = {}
    dropped_features = {}

    # We use the unmasking procedure to compare all unknown texts to all
    # enumerated texts of known authorship.
    # NOTE(review): both dictionaries are re-created for every unknown text,
    # so only the data of the LAST unknown reaches storeJson. Kept as in the
    # original; confirm whether per-unknown accumulation was intended.
    for unknown in jsonhandler.unknowns:
        results = {}
        dropped_features = {}
        # Load the unknown text and create the chunks used for unmasking.
        unknown_text = Text(jsonhandler.getUnknownText(unknown), unknown)
        unknown_text.chunks_generator()

        for candidate in jsonhandler.candidates:
            # One accuracy curve is appended per (known text, repetition).
            results[candidate] = []
            dropped_features[candidate] = []
            for known_text in database.texts[candidate]:
                for experiment in range(NUMBER_EXPERIMENTS):
                    # Fresh copy: features are removed from it round by round.
                    features = (database.features[candidate]).copy()
                    # Presumably selects equally many chunks from both texts
                    # at random (per the original comment); it sets
                    # ``selected_chunks`` on both Text objects.
                    select_chunks(unknown_text, known_text)

                    unknown_chunks = unknown_text.selected_chunks
                    known_chunks = known_text.selected_chunks
                    # The chunk selection is fixed for the whole repetition,
                    # so concatenate and measure the chunks once instead of
                    # once per feature and round (assumes chunk2tokens has
                    # no side effects).
                    chunks = unknown_chunks + known_chunks
                    chunk_sizes = [
                        len(chunk2tokens(chunk[0])) for chunk in chunks
                    ]

                    # Label vector: 0 for chunks of the unknown text, 1 for
                    # chunks of the known text. The original called
                    # label.reshape(n, 1) and discarded the result, so the
                    # labels have always been passed as a 1-D array.
                    label = np.array(
                        [0] * len(unknown_chunks) + [1] * len(known_chunks))

                    fold_scores = []
                    for i in range(NUMBER_ITERATIONS):
                        # Relative frequency of every remaining feature in
                        # every selected chunk.
                        matrix = np.array([
                            [chunk[0].count(token) / size
                             for token in features]
                            for chunk, size in zip(chunks, chunk_sizes)
                        ])
                        print(
                            "进行", candidate, "的第" + str(experiment + 1) +
                            "个实验的第" + str(i + 1) + "轮。", '\n')

                        classifier = svm.LinearSVC()
                        classifier.fit(matrix, label)
                        # Accuracy on the same data the classifier was
                        # fitted on.
                        fold_scores.append(classifier.score(matrix, label))

                        weights = classifier.coef_[0]
                        # argsort is ascending: the tail holds the largest
                        # weights, the head the smallest.
                        order = np.argsort(weights)
                        delete = list(order[-FOLD_ELIMINATION:]) \
                            + list(order[:FOLD_ELIMINATION])

                        # Resolve indices to feature values first: removing
                        # step by step would shift the indices.
                        delete_features = [features[x] for x in delete]
                        (dropped_features[candidate]).append(delete_features)

                        for feature in delete_features:
                            # The same feature can be listed twice when few
                            # features are left.
                            if feature in features:
                                features.remove(feature)

                    (results[candidate]).append(fold_scores)

    # save everything in the specified directory
    jsonhandler.storeJson(outputdir, results, dropped_features)
Beispiel #19
0
def main():
    """Attribute the unknown texts of a corpus from the command line.

    Parses ``-i`` (corpus directory) and ``-o`` (output directory). If
    either is missing, the usage help is printed and the function returns
    without touching the corpus.

    Otherwise the training texts of every candidate are concatenated.
    Candidates with fewer than ``MINTRAINLEN`` whitespace-separated words
    are dropped. Each unknown text with at least ``MINLEN`` words is then
    compared ``REPETITIONS`` times against every remaining candidate, each
    time on a random half of the feature list. The candidate that wins most
    often is the answer if its share of wins reaches ``THRESHOLD``;
    otherwise the answer is ``"None"``.

    Returns:
        None.  Answers and scores are written through
        ``jsonhandler.storeJson``.
    """
    parser = argparse.ArgumentParser(
        description="PPM approach according to Koppel11"
    )
    parser.add_argument("-i", action="store", help="path to corpus directory")
    parser.add_argument("-o", action="store", help="path to output directory")

    args = vars(parser.parse_args())
    corpusdir = args["i"]
    outputdir = args["o"]

    if corpusdir is None or outputdir is None:
        parser.print_help()
        return

    jsonhandler.loadJson(corpusdir)
    jsonhandler.loadTraining()
    # Read the lists after loading: this works whether jsonhandler fills
    # them in place or rebinds the module attributes.
    unknowns = jsonhandler.unknowns

    print("Loading texts for training...")
    texts = {}
    corpus_parts = []
    for cand in jsonhandler.candidates:
        parts = []
        for filename in jsonhandler.trainings[cand]:
            parts.append(jsonhandler.getTrainingText(cand, filename))
            print(f"Text {filename} read")
        # Join once instead of growing a string with += in the loop.
        text = "".join(parts)
        # Candidates with too little training material are left out.
        if len(text.split()) >= MINTRAINLEN:
            texts[cand] = text
            corpus_parts.append(text)

    candidates = [cand for cand in jsonhandler.candidates if cand in texts]

    if not candidates:
        # Previously min() failed on an empty list here. Without any usable
        # candidate every unknown text is left unattributed, using the same
        # "None"/0 answer as for too-short unknown texts.
        print("No candidate has enough training text.")
        print("Storing answers...")
        jsonhandler.storeJson(outputdir, unknowns,
                              ["None"] * len(unknowns), [0] * len(unknowns))
        print("Done!")
        return

    minwords = min(len(text.split()) for text in texts.values())
    print(minwords)

    feature_list = training("".join(corpus_parts))
    authors = []
    scores = []

    for filename in unknowns:
        print(f"Testing {filename}")
        utext = jsonhandler.getUnknownText(filename)
        uwords = utext.split()
        ulen = len(uwords)

        if ulen < MINLEN:
            # Too short to be judged at all.
            authors.append("None")
            scores.append(0)
            continue

        # Compare on equally long excerpts: no longer than the unknown text
        # and no longer than the shortest candidate text.
        textlen = min(ulen, minwords)
        print(textlen)
        # Words are joined without a separator.
        ustring = "".join(uwords[:textlen])

        wins = [0] * len(candidates)
        for _ in range(REPETITIONS):
            rfl = random.sample(feature_list, len(feature_list) // 2)
            sims = [
                test_sim(get_random_string(texts[cand], textlen),
                         ustring, rfl, 1)
                for cand in candidates
            ]
            wins[sims.index(max(sims))] += 1

        best_wins = max(wins)
        score = best_wins / float(REPETITIONS)
        # Below the threshold the winner is not trusted, but the score is
        # still reported.
        if score >= THRESHOLD:
            authors.append(candidates[wins.index(best_wins)])
        else:
            authors.append("None")
        scores.append(score)

    print("Storing answers...")
    jsonhandler.storeJson(outputdir, unknowns, authors, scores)
    print("Done!")