Beispiel #1
0
    def test_shouldContainMethodWork(self):
        """Check sentence-level membership against a tokenized document.

        Every sentence of the first sample must appear among the document's
        sentence tokens; every sentence of the second sample must not.
        """
        present_text, absent_text = [
            'I am fine, how about you?',
            'One needs to hydrate regularly.',
        ]
        document = ('I am fine, how about you? this weather is really hot and humid. '
                    'One needs to hydrate regularly to survive.')

        document_sentences = util.sent_tokenize(document)

        for sentence in util.sent_tokenize(present_text):
            self.assertIn(sentence, document_sentences)

        for sentence in util.sent_tokenize(absent_text):
            self.assertNotIn(sentence, document_sentences)
Beispiel #2
0
    def test_shouldSentenceTokenWork(self):
        """Check that ``util.sent_tokenize`` yields exactly the expected sentences."""

        o_text = ['I am fine. how about you? the weather is hot']
        right_tokens = ['I am fine.', 'how about you?', 'the weather is hot']

        for text in o_text:
            sent_tokens = util.sent_tokenize(text)
            # Compare the complete sequences. Pairing them with zip() stops at
            # the shorter one, so a tokenizer returning too few (or zero)
            # sentences would let the test pass without checking anything.
            self.assertEqual(list(sent_tokens), right_tokens)
Beispiel #3
0
    def test_shouldContainMethodWithWordWork(self):
        """Check word-level membership when sentence-level membership fails.

        Neither sample sentence may appear among the document's sentence
        tokens. The words of the first sample must all be found among the
        document's word tokens; the words of the second sample must all be
        missing from them.
        """
        document = ('I am fine, how about you? this weather is really hot and humid. '
                    'One needs to hydrate regularly to survive.')

        document_sentences = util.sent_tokenize(document)
        document_words = util.word_tokenize(document)

        # Each case pairs a sample text with the assertion applied to its words.
        cases = [
            ('One needs to hydrate regularly.', self.assertIn),
            ('intersteller was nice movie', self.assertNotIn),
        ]

        for sample, check_word in cases:
            for sentence in util.sent_tokenize(sample):
                self.assertNotIn(sentence, document_sentences)
                for word in util.word_tokenize(sentence):
                    check_word(word, document_words)
def _count_user_turns(next_revs, author_id):
    """Count the editing turns taken by users other than ``author_id``.

    A revision opens a new turn when its ``userid`` differs from the author
    and from the user of the previously counted turn.
    """
    turns = 0
    # NOTE(review): 0 is used as the "nobody yet" marker, so a leading revision
    # whose userid is 0 is never counted -- confirm that this is intended.
    last_user_id = 0
    for rev in next_revs:
        if rev['userid'] != last_user_id and rev['userid'] != author_id:
            turns += 1
            last_user_id = rev['userid']
    return turns


def _longevity_percent(turns_survived, total_turns):
    """Return turns_survived / total_turns as a percentage (ratio rounded to 2 places)."""
    return round(turns_survived / total_turns, 2) * 100


def _extract_contribution(parent_rev, current_rev):
    """Return the sentences of the parent/current diff after stop-word removal.

    Takes element ``[1]`` of every item returned by ``util.findDiffRevised``
    (presumably the changed text -- verify against ``util``), drops fragments
    of length <= 1, splits the rest into sentences and filters each sentence
    through ``util.stop_word_removal``.
    """
    fragments = [item[1] for item in util.findDiffRevised(parent_rev, current_rev)]
    sentences = []
    for fragment in fragments:
        if len(fragment) <= 1:
            continue
        for sentence in util.sent_tokenize(fragment):
            sentences.append(util.stop_word_removal(sentence))
    return sentences


def calcDiff_Enhanced(user_id, should_clean=False):
    """
    Compute the longevity of a user's contributions over the following revisions.

    Reads ``user_data_all/rev_list_<user_id>.json``, and for every row whose
    ``parentid`` is not 0 stores ``contribLength`` and ``originaltext``. When
    the contribution is non-empty and other users edited afterwards, it also
    stores ``longevityRev`` (percentage of other-user turns survived),
    ``matchRatio`` and ``totalContrib``. The annotated rows are written to
    ``user_data_all_b/rev_list_<user_id>-dp.json`` when the list is non-empty.

    The original docstring speaks of "the next 50 revisions"; this function
    simply walks whatever ``row['next_rev']`` contains.

    Args:
        user_id (str): user id of the user; used to build the file names.
        should_clean (bool): when true, revision texts are passed through
            ``util.cleanhtml`` before being compared.

    Returns:
        None. Results are written to disk. Any exception is reported on
        stdout and swallowed, as in the original best-effort behaviour.
    """
    try:
        with open('user_data_all/rev_list_' + user_id + '.json',
                  'r') as infile:
            updated_data = json.load(infile)

        for row in updated_data:
            print("Picking For Analysis Artcile,Parent,Revision: ",
                  [row['pageid'], row['parentid'], row['revid']])
            current_rev = util.read_file('rev_user/' + str(row['revid']))
            if should_clean:
                current_rev = util.cleanhtml(current_rev).strip()

            if row['parentid'] == 0:
                # NOTE(review): rows without a parent revision were never
                # analysed by the original code either -- confirm intent.
                continue

            parent_rev = util.read_file('rev_user/' + str(row['parentid']))
            if should_clean:
                parent_rev = util.cleanhtml(parent_rev).strip()

            original_text = _extract_contribution(parent_rev, current_rev)
            total = sum(len(txt) for txt in original_text)

            row['contribLength'] = total
            row['originaltext'] = original_text

            next_revs = list(row['next_rev'])
            if total <= 0:
                continue

            print(original_text)
            print("Performing Diff For Artcile,Parent,Revision: ",
                  [row['pageid'], row['parentid'], row['revid'], total])

            # Total number of turns taken by other users after this revision.
            total_no_rev = _count_user_turns(next_revs, row['userid'])
            print("Total Turns/ Contribution: ",
                  [total_no_rev, len(next_revs)])

            if total_no_rev == 0:
                # Without a turn by another user the percentage is undefined.
                # Previously this divided by zero, which aborted the whole
                # run and prevented the output file from being written.
                print("no turns by other users, longevity not computed")
                continue

            index = 0  # other-user turns seen so far
            has_zero = False  # True while the contribution is fully absent
            capture_longevity = True
            last_user_id = 0
            ratio = None  # reset per row so a stale value is never recorded

            for rev in next_revs:
                try:
                    next_rev = util.read_file('rev_user/' + str(rev['revid']))
                    if should_clean:
                        next_rev = util.cleanhtml(next_rev)
                    d_text = util.getInsertedContentSinceParentRevision(
                        parent_rev, next_rev).strip()
                    ratio = util.textPreservedRatioBigramEnhanced(
                        original_text, d_text)

                    if rev['userid'] != last_user_id and rev[
                            'userid'] != row['userid']:
                        index += 1
                        last_user_id = rev['userid']

                    print("ratio: ", ratio)
                    if ratio == 0 and not has_zero:
                        # Contribution vanished; remember where, but keep
                        # scanning in case a later revision restores it.
                        has_zero = True
                        row['longevityRev'] = _longevity_percent(
                            index, total_no_rev)
                        row['matchRatio'] = ratio
                        row['totalContrib'] = total
                        print("in zero mode")
                    elif ratio >= 0.9 and has_zero:
                        has_zero = False
                        print("out zero mode")
                    if ratio < 0.90 and capture_longevity and not has_zero:
                        row['longevityRev'] = _longevity_percent(
                            index, total_no_rev)
                        row['matchRatio'] = ratio
                        row['totalContrib'] = total
                        capture_longevity = False
                        print("longevity-S: ", index)
                        break
                except Exception as e:
                    # Best effort: a revision that cannot be processed is
                    # reported and skipped.
                    print("file error", e)

            if capture_longevity and not has_zero:
                if ratio is None:
                    # No following revision could be compared, so there is
                    # no ratio to record for this row.
                    print("no comparable revision, longevity not computed")
                    continue
                row['longevityRev'] = _longevity_percent(index, total_no_rev)
                row['matchRatio'] = ratio
                row['totalContrib'] = total
                print("longevity-L: ", index)

        if updated_data:
            with open('user_data_all_b/rev_list_' + user_id + '-dp.json',
                      'w') as outfile:
                json.dump(updated_data, outfile)
    except Exception as e:
        print("skipping diff as no contribution: ", e)