Code Example #1
File: tests.py  Project: sinyovercosy/pagerank
from pagerank import rank  # assumed import


def main():
    links = [[1,2,3],[3],[0,3],[0,2]]
    result = rank(links)
    print("testing simple")
    print("INPUT:")
    print(links)
    print("OUTPUT:")
    print(result)
    assert (result == [3, 0, 2, 1])
    print("passed!")

    links = [[1,2,3],[3],[0,3],[]]
    result = rank(links)
    print("testing dangling")
    print("INPUT:")
    print(links)
    print("OUTPUT:")
    print(result)
    assert (result == [3, 0, 2, 1])
    print("passed!")

    links = [[1,2,3], [3,4], [0,3], [1,6], [6], [4,7], [5], [5,6]]
    result = rank(links)
    print("testing reducible")
    print("INPUT:")
    print(links)
    print("OUTPUT:")
    print(result)
    assert (result == [5, 6, 4, 7, 3, 1, 0, 2])
    print("passed!")

    links = [[1, 5], [2, 5], [1, 3, 5], [4], [1, 5], [2, 6], [0, 1]]
    result = rank(links)
    print("testing given sample 1")
    print("INPUT:")
    print(links)
    print("OUTPUT:")
    print(result)
    assert (result == [5, 2, 1, 6, 3, 4, 0] or result == [5, 2, 1, 6, 4, 3, 0])
    print("passed!")

    links = [[1,3,4],[0,2,4],[3,6],[2,4,6],[5,8],[4,6,8],[0,7,9],[0,6,8],[2,9],[0,2,8]]
    result = rank(links)
    print("testing given sample 2")
    print("INPUT:")
    print(links)
    print("OUTPUT:")
    print(result)
    assert (result == [2,6,8,0,3,9,4,5,7,1] or result == [2,6,8,0,4,3,9,5,7,1])
    print("passed!")
Code Example #2
def getRankings(week, model, teamToIndex, week_multiplier=None):

    A = np.array(getMatrixForSeason(week, model, week_multiplier))
    R = pagerank.rank(A)

    rankings = [(i, R[i]) for i in range(len(teamToIndex))]

    indexToExpectedPoints, indexToRandomPoints, indexToMorePoints = getSeasonStats(
        R, week, model[0], teamToIndex)

    return indexToExpectedPoints
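Here (and in the later snippets that read R[i][0]) pagerank.rank receives a NumPy matrix and returns something indexable as an n x 1 column of scores. The actual module is not shown on this page; a minimal sketch, assuming A is already a column-stochastic link matrix and a damping factor of 0.85, could look like this:

import numpy as np

def rank(A, damping=0.85, tol=1e-9, max_iter=1000):
    # Power iteration; returns an n x 1 column vector so scores read as R[i][0].
    n = A.shape[0]
    R = np.full((n, 1), 1.0 / n)
    teleport = np.full((n, 1), (1.0 - damping) / n)
    for _ in range(max_iter):
        R_next = damping * (A @ R) + teleport
        if np.abs(R_next - R).sum() < tol:
            return R_next
        R = R_next
    return R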
Code Example #3
    def f(collected_data, week, canAddInput, homeIndex, awayIndex, homeScore,
          awayScore):

        if not canAddInput:
            return

        # This also happens in createA; it could happen here as well,
        # depending on when canAddInput is set.
        if var not in collected_data:
            R = [[1.0 / collected_data['team_count']]
                 for i in range(collected_data['team_count'])]
        else:
            A = collected_data[var]
            R = pagerank.rank(A)

        collected_data["inputs"][-1].append(R[homeIndex][0])
        collected_data["inputs"][-1].append(R[awayIndex][0])
Code Example #4
def main(csv_file_name='epl-2021.csv',
         output_file_name='epl-predictions-stats.json'):

    model = epl.getData(csv_file_name)
    data, indexToTeam, teamToIndex, indexToGamesPlayed = model
    predictor = predictions_tensorflow.createPredictGameFunction(csv_file_name)
    indexToPlaceFinishedToTimesFinished, indexToMeanPoints, indexToTeam = probs.calculateProbs(
        10**5, csv_file_name, predictor)

    week = 40

    A = np.array(epl.getMatrixForSeason(week, model, None))
    R = pagerank.rank(A)

    rankings = [(indexToMeanPoints[i], i)
                for i in range(len(indexToMeanPoints))]
    rankings.sort(reverse=True)

    jsonData = []
    for value in rankings:
        probabilties = indexToPlaceFinishedToTimesFinished[value[1]]
        jsonData.append({
            "name": indexToTeam[value[1]],
            "probability": list(probabilties),
            "expected": value[0],
            "championslegue": (probabilties[0] + probabilties[1] +
                               probabilties[2] + probabilties[3]),
            "relegated": (probabilties[-1] + probabilties[-2] +
                          probabilties[-3]),
            "pagerank": R[value[1]][0],
        })

    games_data = probs.calculateProbsOfEachGameInASeason(
        csv_file_name, predictor)

    with open(output_file_name, 'w') as out:
        out.write(json.dumps({'teams': list(jsonData), 'games': games_data}))
Code Example #5
from utils import parse
import pagerank

graph = parse("graph.txt")
pagerank.rank(graph)
Code Example #6
if __name__ == "__main__":

    csvFileName = "epl-2021.csv"
    week = 40
    if len(sys.argv) > 1:
        csvFileName = sys.argv[1]
    if len(sys.argv) > 2:
        week = int(sys.argv[2])

    model = getData(csvFileName)
    data, indexToTeam, teamToIndex, indexToGamesPlayed = model

    # Build the link matrix A and compute the PageRank vector R
    A = getMatrixForSeason(week, model, None)  # alternative: createWeekMultiplier(8)
    R = pagerank.rank(A)

    rankings = [(i, R[i]) for i in range(len(indexToTeam))]

    printRankings(rankings, model)

    indexToExpectedPoints, indexToRandomPoints, indexToMorePoints = getSeasonStats(
        R, week, data, teamToIndex)

    expectedPointsRanking = [(i, indexToExpectedPoints[i])
                             for i in range(len(indexToExpectedPoints))]
    randomPointsRanking = [(i, indexToRandomPoints[i])
                           for i in range(len(indexToRandomPoints))]
    morePointsRanking = [(i, indexToMorePoints[i])
                         for i in range(len(indexToMorePoints))]
Code Example #7
def select_top(dataset='training'):
    ExcludeSentencesWithNoNamedEntities = True
    ner_tagger = loadStanfordNERTagger()
    meta_regex = re.compile(r'^([A-Z]{2,}.{,25}\(.{,25}\))|^([A-Z\s]{2,}(\_|\-))')
    ranked_sentences = pagerank.rank(dataset)
    input_directoryPath = os.path.join('outputs/pagerank_D4', dataset)
    output_directoryPath = os.path.join('outputs/reranker_D4', dataset)
    top_sents = {}
    for topic_id in ranked_sentences.keys():
        sentences = ranked_sentences[topic_id]
        id_part1 = topic_id[:-1]
        id_part2 = topic_id[-1:]
        output_file_name = id_part1 + "-A.M.100." + id_part2 + ".1"
        output_file_path = os.path.join(output_directoryPath, output_file_name)

        vocab = set()
        for sentence in sentences:
            original = sentence.original_sent
            match = re.search(meta_regex, original)
            clean = re.sub(meta_regex, '', original).replace('--', '').lower()
            sentence.original_sent = re.sub(meta_regex, '', original).replace('--', '')
            sentence.original_sent = compress(sentence.original_sent)
            sentence.clean_sent = clean
            splitted = clean.split()
            for word in splitted:
                vocab.add(word.lower())

        vocab_sentences_count = {}
        for word in vocab:
            vocab_sentences_count[word] = 0

        for sentence in sentences:
            unique_terms = set()
            for word in sentence.clean_sent.split():
                unique_terms.add(word)

            for word in unique_terms:
                vocab_sentences_count[word] = vocab_sentences_count[word] + 1

        idf = {}
        for word in vocab:
            idf[word] = 1.0 + math.log(len(sentences) / float(vocab_sentences_count[word]))

        chosen_sentences = []
        total_word_count = 0
        for sent_obj in sentences:
            sentence = sent_obj.clean_sent

            if total_word_count + len(sentence.split()) > 100:
                continue

            # decide whether or not to include the sentence based on the cosine similarity
            include_sentence = True
            for chosen_sentence_obj in chosen_sentences:
                chosen_sentence = chosen_sentence_obj.clean_sent
                cosine_score = cosine(sentence.lower(), chosen_sentence.lower(), idf)
                if cosine_score > THRESHOLD:
                    include_sentence = False
                    break

            if include_sentence:
                # exclude sentences with no named entities
                if ExcludeSentencesWithNoNamedEntities and not hasNamedEntities(ner_tagger, sent_obj.original_sent):
                    logging.info('Ignoring sentence because it does not have named entities')
                    logging.info('Sentence: ' + sent_obj.original_sent )
                    continue
                # exclude quotes
                if sent_obj.clean_sent.startswith('\'\'') or sent_obj.clean_sent.startswith('``'):
                    continue

                chosen_sentences.append(sent_obj)
                total_word_count += len(sentence.split())

        with io.open(output_file_path,'w', encoding='utf8') as outputFile:
            for sentence in chosen_sentences:
                outputFile.write(sentence.original_sent)
                outputFile.write('\n')
            outputFile.flush()

        top_sents[topic_id] = chosen_sentences

    return top_sents
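This reranker (and the variant in the next example) calls a cosine(sentence_a, sentence_b, idf) helper that is not included in the snippet. Purely as a hypothetical sketch, an idf-weighted cosine similarity over whitespace tokens might be written as:

import math
from collections import Counter

def cosine(sent_a, sent_b, idf):
    # Hypothetical sketch: cosine similarity of tf-idf weighted term vectors.
    vec_a = Counter(sent_a.split())
    vec_b = Counter(sent_b.split())
    dot = sum(vec_a[w] * vec_b[w] * idf.get(w, 1.0) ** 2
              for w in vec_a if w in vec_b)
    norm_a = math.sqrt(sum((c * idf.get(w, 1.0)) ** 2 for w, c in vec_a.items()))
    norm_b = math.sqrt(sum((c * idf.get(w, 1.0)) ** 2 for w, c in vec_b.items()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)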
Code Example #8
def select_top(dataset='training'):
    print('selecting top...')
    meta_regex = re.compile(
        r'^([A-Z]{2,}.{,25}\(.{,25}\))|^([A-Z\s]{2,}(\_|\-))')
    ranked_sentences = pagerank.rank(dataset)
    input_directoryPath = getDirectoryPath("outputs/pagerank_D3/devtest/")
    output_directoryPath = getDirectoryPath("../../outputs/reranker/devtest/")
    top_sents = {}
    for topic_id in ranked_sentences.keys():
        sentences = ranked_sentences[topic_id]
        id_part1 = topic_id[:-1]
        id_part2 = topic_id[-1:]
        output_file_name = id_part1 + "-A.M.100." + id_part2 + ".1"
        output_file_path = output_directoryPath + "/" + output_file_name

        vocab = set()
        for sentence in sentences:

            original = sentence.original_sent
            match = re.search(meta_regex, original)
            clean = re.sub(meta_regex, '', original).replace('--', '').lower()
            clean = compress(clean)
            sentence.clean_sent = clean
            splitted = clean.split()
            for word in splitted:
                vocab.add(word.lower())

        vocab_sentences_count = {}
        for word in vocab:
            vocab_sentences_count[word] = 0

        for sentence in sentences:
            unique_terms = set()
            for word in sentence.clean_sent.lower().split():
                unique_terms.add(word)

            for word in unique_terms:
                vocab_sentences_count[word] = vocab_sentences_count[word] + 1

        idf = {}
        for word in vocab:
            idf[word] = 1.0 + math.log(
                len(sentences) / float(vocab_sentences_count[word]))

        chosen_sentences = []
        total_word_count = 0
        for sent_obj in sentences:
            sentence = sent_obj.clean_sent
            if len(sentence.strip()) > 0:
                if total_word_count + len(sentence.split()) > 100:
                    continue

                # decide whether or not to include the sentence based on the cosine similarity
                include_sentence = True
                for chosen_sentence_obj in chosen_sentences:
                    chosen_sentence = chosen_sentence_obj.clean_sent
                    cosine_score = cosine(sentence.lower(),
                                          chosen_sentence.lower(), idf)
                    if cosine_score > THRESHOLD:
                        include_sentence = False
                        break

                if include_sentence:
                    # exclude quotes
                    starts_with_quote = (sent_obj.clean_sent.startswith("''")
                                         or sent_obj.clean_sent.startswith('``')
                                         or sent_obj.clean_sent.startswith('"'))
                    if not starts_with_quote:
                        chosen_sentences.append(sent_obj)
                        total_word_count += len(sentence.split())

        # Write the selected sentences for this topic to the output file.
        with io.open(output_file_path, 'w', encoding='utf8') as outputFile:
            for sentence in chosen_sentences:
                outputFile.write(sentence.clean_sent)
                outputFile.write(' ')
            outputFile.flush()

        top_sents[topic_id] = chosen_sentences

    return top_sents