Beispiel #1
0
    def test_bigramMethodEnhanced(self):
        """Check util.textPreservedRatioBigramEnhanced against fixed inputs.

        Each case passes a list of contributed fragments and a later revision
        text, and asserts the exact ratio the helper is expected to return.
        """
        # Case 1: wiki-markup fragments against a wiki-markup revision.
        wiki_fragments = [
            '(2001) ',
            ' is that terrapin nests and eggs were found last year,',
            ', Hampshire)',
            ' very',
            '<em>',
            '</em>',
        ]
        wiki_revision = "The '''''', ''Trachemys scripta elegans'' is native to the southern [[nited States]], and has become common in the UKIt is a medium-ration to a tortoise, ranging in sizeKeared terrapins are not native,,(2001)  is that terrapin nests and eggs were found last year,t[[snapping turtles]]6, Hampshire) very'''', tterrapins are members of the group [[]], referring to reptiles with a shell, which contains"
        self.assertEqual(
            util.textPreservedRatioBigramEnhanced(wiki_fragments,
                                                  wiki_revision),
            0.99)

        # Case 2: plain-English fragments against a plain-English revision.
        weather_revision = (
            'I am fine, how about you? this weather is really hot and humid. '
            'One needs to hydrate regularly to survive.'
        )
        weather_fragments = [
            'Cant See this weather. I am fine',
            'intersteller was nice movie',
        ]
        self.assertEqual(
            util.textPreservedRatioBigramEnhanced(weather_fragments,
                                                  weather_revision),
            0.12)

        # Remaining cases reuse the same revision text as case 2.
        remaining_cases = (
            (['One needs to hydrate regularly to survive.',
              'this weather really'], 1.0),
            (['One needs', 'two three'], 0.5),
            (['One needs to regularly to survive.'], 1.0),
        )
        for fragments, expected_ratio in remaining_cases:
            observed_ratio = util.textPreservedRatioBigramEnhanced(
                fragments, weather_revision)
            self.assertEqual(observed_ratio, expected_ratio)
def _record_longevity(row, turns_survived, total_turns, ratio, contrib_length):
    """Store the longevity percentage, match ratio and contribution length on row."""
    row['longevityRev'] = round(turns_survived / total_turns, 2) * 100
    row['matchRatio'] = ratio
    row['totalContrib'] = contrib_length


def calcDiff_Enhanced(user_id, should_clean=False):
    """
      Calculate how long each contribution of a user survives in the
      follow-up revisions listed under the row's 'next_rev' key.

      For every row loaded from 'user_data_all/rev_list_<user_id>.json' that
      has a parent revision, the text added relative to the parent is
      extracted, split into sentences and passed through stop-word removal.
      Each follow-up revision is then compared with that text, and the share
      of "user turns" (revisions by a user other than the author and other
      than the previously counted user) reached before the match ratio drops
      below 0.9 is stored on the row.

      Rows receive the keys 'contribLength' and 'originaltext', and, when a
      longevity could be determined, 'longevityRev', 'matchRatio' and
      'totalContrib'. Rows without a parent revision, without contributed
      text, without any turn by another user, or without a single readable
      follow-up revision get no longevity keys.

      Args:
          user_id (str): user id of the user.
          should_clean (bool): when True, revision texts are passed through
              util.cleanhtml before they are compared.
      Result:
          None. The annotated rows are written to
          'user_data_all_b/rev_list_<user_id>-dp.json' when the input list is
          not empty. Any exception is caught and reported with print.
       """
    try:
        with open('user_data_all/rev_list_' + user_id + '.json',
                  'r') as infile:
            updated_data = json.load(infile)

        for row in updated_data:
            print("Picking For Analysis Artcile,Parent,Revision: ",
                  [row['pageid'], row['parentid'], row['revid']])
            capture_longevity = True
            current_rev = util.read_file('rev_user/' + str(row['revid']))
            if should_clean:
                current_rev = util.cleanhtml(current_rev).strip()

            if row['parentid'] == 0:
                # No parent revision to diff against; nothing is recorded.
                continue

            parent_rev = util.read_file('rev_user/' + str(row['parentid']))
            if should_clean:
                parent_rev = util.cleanhtml(parent_rev).strip()

            # Keep the second element of every diff entry, dropping fragments
            # of one character or less.
            diff_entries = util.findDiffRevised(parent_rev, current_rev)
            fragments = [entry[1] for entry in diff_entries]
            fragments = [frag for frag in fragments if len(frag) > 1]

            original_text = []
            for fragment in fragments:
                for sentence in util.sent_tokenize(fragment):
                    original_text.append(util.stop_word_removal(sentence))

            total = sum(len(txt) for txt in original_text)

            row['contribLength'] = total
            row['originaltext'] = original_text

            next_revs = list(row['next_rev'])
            if total <= 0:
                continue

            print(original_text)
            print("Performing Diff For Artcile,Parent,Revision: ",
                  [row['pageid'], row['parentid'], row['revid'], total])

            # finding total number of user turns
            total_no_rev = 0
            lastUserID = 0
            for rev in next_revs:
                if rev['userid'] != lastUserID and rev['userid'] != row[
                        'userid']:
                    total_no_rev += 1
                    lastUserID = rev['userid']

            print("Total Turns/ Contribution: ",
                  [total_no_rev, len(next_revs)])

            if total_no_rev == 0:
                # Without a turn by another user the percentage below would
                # divide by zero; previously that aborted the whole user and
                # the output file was never written. Skip only this row.
                print("no turns by other users, longevity not recorded: ",
                      row['revid'])
                continue

            index = 0
            hasZero = False
            lastUserID = 0
            # Reset per row so a value from a previous row is never reused.
            ratio = None

            # finding achieved number of user turns
            for rev in next_revs:
                try:
                    next_rev = util.read_file('rev_user/' + str(rev['revid']))
                    if should_clean:
                        next_rev = util.cleanhtml(next_rev)
                    d_text = util.getInsertedContentSinceParentRevision(
                        parent_rev, next_rev).strip()
                    ratio = util.textPreservedRatioBigramEnhanced(
                        original_text, d_text)

                    if rev['userid'] != lastUserID and rev['userid'] != row[
                            'userid']:
                        index += 1
                        lastUserID = rev['userid']

                    print("ratio: ", ratio)
                    if ratio == 0 and not hasZero:
                        # Text vanished completely: remember the point, but
                        # keep scanning in case a later revision restores it.
                        hasZero = True
                        _record_longevity(row, index, total_no_rev, ratio,
                                          total)
                        print("in zero mode")
                    elif ratio >= 0.9 and hasZero:
                        hasZero = False
                        print("out zero mode")
                    if ratio < 0.90 and capture_longevity and not hasZero:
                        _record_longevity(row, index, total_no_rev, ratio,
                                          total)
                        capture_longevity = False
                        print("longevity-S: ", index)
                        break
                except Exception as e:
                    # Best effort: a revision that cannot be processed is
                    # reported and skipped.
                    print("file error", e)

            if capture_longevity and not hasZero:
                if ratio is None:
                    # No follow-up revision could be compared, so there is
                    # no match ratio to store for this row.
                    print("no readable revision, longevity not recorded: ",
                          row['revid'])
                else:
                    _record_longevity(row, index, total_no_rev, ratio, total)
                    print("longevity-L: ", index)

        if updated_data:
            with open('user_data_all_b/rev_list_' + user_id + '-dp.json',
                      'w') as outfile:
                json.dump(updated_data, outfile)
    except Exception as e:
        print("skipping diff as no contribution: ", e)
Beispiel #3
0
    def test_bigramMethodEnhancedClean(self):
        """Check util.textPreservedRatioBigramEnhanced with cleanhtml-processed input.

        The first case passes fragments and revision text unmodified. Every
        later case runs each fragment through util.cleanhtml and compares it
        against a revision text that was also passed through util.cleanhtml.
        """

        def clean_all(fragments):
            # One comprehension replaces the repeated index loops that
            # rewrote the fragment list in place.
            return [util.cleanhtml(fragment) for fragment in fragments]

        # Case 1: raw fragments against a raw revision text.
        o_text = ['Old Hittite has th', 'of', "where Cuneiform Luwian instead uses the ''-ssa'' adjectival suffix. ", ', on the northern border of both, like later', 'eroglyphic Luwian has', "an ''-as'' genitive and an ''-asa'' adjectival suffix. Palaic also shows the same gender distinction as seen in Hittite, i.e. animate vs. inanimate; and has similar pronoun forms. Therefore Palaic is thought to belong to the Anatolian languages, although whether as a sister language to Old Hittite or Cuneiform Luwian is un", 'nown']

        d_text = (
            "\n"
            ", the capital of the HittitesHittite language|(Anatolia)|Pala  only1Emmanuel   , the leading God of the land of PalIn particular, ain Hittite foriparwa and associated deities, includes passages stating, \"The Old Woman speaks the words of the bread in Palaic,\" or alternately \"the words of the meal,\" though no Palaic passages are quoted. The Palaic-language texts are all from a religious context, with ritual and mythological content.arruba, O. ''Das Palaische. exte, Grammatik, Lexikon.'' Wiesbaden: arrassowitz, 190. StBoT 10 Old Hittite has thofwhere Cuneiform script|Cuneiform Luwian instead uses the ''-ssa'' adjectival suffix. , on the northern border of both, like later Anatolian hieroglyph|eroglyphic Luwian hasan ''-as'' genitive and an ''-asa'' adjectival suffix. Palaic also shows the same gender distinction as seen in Hittite, i.e. animate vs. inanimate; and has similar pronoun forms. Therefore Palaic is thought to belong to the Anatolian languages, although whether as a sister language to Old Hittite or Cuneiform Luwian is unnown"
            "\n"
            "\n"
            "\n"
            "\n"
            "\n"
        )

        ratio = util.textPreservedRatioBigramEnhanced(o_text, d_text)

        self.assertEqual(ratio, 0.74)

        # Case 2: cleaned fragments against a cleaned wiki-markup revision.
        o_text = clean_all(['(2001) ', ' is that terrapin nests and were found last year,', ', Hampshire)', ' very', '<em>', '</em>'])

        d_text = util.cleanhtml("The '''''', ''Trachemys scripta elegans'' is native to the southern [[nited States]], and has become common in the UKIt is a medium-ration to a tortoise, ranging in sizeKeared terrapins are not native,,(2001)  is that terrapin nests and eggs were found last year,t[[snapping turtles]]6, Hampshire) very'''', tterrapins are members of the group [[]], referring to reptiles with a shell, which contains")

        ratio = util.textPreservedRatioBigramEnhanced(o_text, d_text)

        self.assertEqual(ratio, 1.0)

        # Case 3: the revision text is cleaned before the fragments, keeping
        # the original order of util.cleanhtml calls.
        d_text = util.cleanhtml(
            'I am fine, how about you? this weather is really hot and humid. '
            'One needs to hydrate regularly to survive.')
        o_text = clean_all(['Cant See this weather. I am fine', 'intersteller was nice movie'])

        ratio = util.textPreservedRatioBigramEnhanced(o_text, d_text)

        self.assertEqual(ratio, 0.12)

        # Remaining cases reuse the cleaned revision text from case 3.
        remaining_cases = (
            (['One needs to hydrate regularly to survive.', 'this weather really'], 1.0),
            (['One needs', 'two three'], 0.5),
            (['One needs to regularly to survive.'], 1.0),
        )
        for fragments, expected in remaining_cases:
            o_text = clean_all(fragments)

            ratio = util.textPreservedRatioBigramEnhanced(o_text, d_text)

            self.assertEqual(ratio, expected)