Ejemplo n.º 1
0
    def test_ExtractOriginalContribution(self):
        """Diffing a text against a longer revision yields two entries.

        The revised text adds the words "def" and "jkl" to the base text;
        ``util.findDiffRevised`` is expected to return a result of length 2.
        """
        before = "abc ghi mno"
        after = "abc def ghi jkl mno"

        diff_entries = util.findDiffRevised(before, after)
        entry_count = len(diff_entries)
        self.assertEqual(2, entry_count)
Ejemplo n.º 2
0
def _record_longevity(row, turns_seen, total_turns, ratio, total):
    """Store the longevity figures for one contribution on its row.

    ``longevityRev`` is the number of other-user turns seen so far expressed
    as a percentage of ``total_turns``; ``total_turns`` must be non-zero.
    """
    row['longevityRev'] = round(turns_seen / total_turns, 2) * 100
    row['matchRatio'] = ratio
    row['totalContrib'] = total


def calcDiff_Enhanced(user_id, should_clean=False):
    """
      It calculates the longevity of each contribution of a user across the
      following revisions listed in the row's ``next_rev`` entry.

      For every row read from ``user_data_all/rev_list_<user_id>.json`` that
      has a parent revision, the text added relative to the parent is
      extracted, tokenised into sentences and stripped of stop words. Each
      following revision is then compared against that text; the first one
      whose preserved ratio drops below 0.90 (outside "zero mode") ends the
      measurement. The annotated rows are written to
      ``user_data_all_b/rev_list_<user_id>-dp.json``.

      Args:
          user_id (str): user id of the user.
          should_clean (bool): when true, revision texts are passed through
              ``util.cleanhtml`` before being compared.
      Result:
          None. Rows are annotated in place with ``contribLength``,
          ``originaltext`` and, when measurable, ``longevityRev`` (percentage
          of other-user turns), ``matchRatio`` and ``totalContrib``. Any error
          outside the per-revision comparison is printed and aborts the run
          without writing the output file.
       """
    try:
        with open('user_data_all/rev_list_' + user_id + '.json',
                  'r') as infile:
            updated_data = json.loads(infile.read())

        for row in updated_data:
            print("Picking For Analysis Artcile,Parent,Revision: ",
                  [row['pageid'], row['parentid'], row['revid']])
            current_rev = util.read_file('rev_user/' + str(row['revid']))
            if should_clean:
                current_rev = util.cleanhtml(current_rev).strip()

            if row['parentid'] == 0:
                # No parent revision to diff against: nothing is measured.
                continue

            parent_rev = util.read_file('rev_user/' + str(row['parentid']))
            if should_clean:
                parent_rev = util.cleanhtml(parent_rev).strip()

            # Keep the second element of every diff entry, dropping
            # fragments of length 0 or 1.
            diff_entries = util.findDiffRevised(parent_rev, current_rev)
            added_chunks = [v[1] for v in diff_entries if len(v[1]) > 1]

            original_text = [
                util.stop_word_removal(sent_tok)
                for contribution in added_chunks
                for sent_tok in util.sent_tokenize(contribution)
            ]

            total = sum(len(txt) for txt in original_text)

            row['contribLength'] = total
            row['originaltext'] = original_text

            next_revs = list(row['next_rev'])
            if total <= 0:
                continue

            print(original_text)
            print("Performing Diff For Artcile,Parent,Revision: ",
                  [row['pageid'], row['parentid'], row['revid'], total])

            # finding total number of user turns: a turn starts when a
            # revision comes from someone other than the contributor and
            # other than the user of the previously counted turn.
            total_no_rev = 0
            lastUserID = 0
            for rev in next_revs:
                if rev['userid'] != lastUserID and rev[
                        'userid'] != row['userid']:
                    total_no_rev += 1
                    lastUserID = rev['userid']

            print("Total Turns/ Contribution: ",
                  [total_no_rev, len(next_revs)])

            if total_no_rev == 0:
                # Without any turn by another user the percentage below would
                # divide by zero; previously that error aborted the whole
                # user and no output was written.
                print("skipping longevity, no turns by other users after "
                      "revision: ", row['revid'])
                continue

            index = 0
            hasZero = False
            lastUserID = 0
            capture_longevity = True
            # Reset per row so a value left over from an earlier row can
            # never be recorded for this one.
            ratio = None

            # finding achieved number of user turns
            for rev in next_revs:
                try:
                    next_rev = util.read_file('rev_user/' +
                                              str(rev['revid']))
                    if should_clean:
                        next_rev = util.cleanhtml(next_rev)
                    d_text = util.getInsertedContentSinceParentRevision(
                        parent_rev, next_rev).strip()
                    ratio = util.textPreservedRatioBigramEnhanced(
                        original_text, d_text)

                    if rev['userid'] != lastUserID and rev[
                            'userid'] != row['userid']:
                        index += 1
                        lastUserID = rev['userid']

                    print("ratio: ", ratio)
                    if ratio == 0 and not hasZero:
                        # Record the figures on entering zero mode; they
                        # stand unless a later revision scores >= 0.9 again.
                        hasZero = True
                        _record_longevity(row, index, total_no_rev, ratio,
                                          total)
                        print("in zero mode")
                    elif ratio >= 0.9 and hasZero:
                        hasZero = False
                        print("out zero mode")
                    if ratio < 0.90 and not hasZero:
                        _record_longevity(row, index, total_no_rev, ratio,
                                          total)
                        capture_longevity = False
                        print("longevity-S: ", index)
                        break
                except Exception as e:
                    # Best effort: a revision that cannot be read or
                    # compared is reported and skipped.
                    print("file error", e)

            if capture_longevity and not hasZero:
                if ratio is None:
                    # No following revision could be compared, so there is
                    # no ratio to report for this row.
                    print("skipping longevity, no comparable revision "
                          "after: ", row['revid'])
                    continue
                _record_longevity(row, index, total_no_rev, ratio, total)
                print("longevity-L: ", index)
        if updated_data:
            with open('user_data_all_b/rev_list_' + user_id + '-dp.json',
                      'w') as outfile:
                json.dump(updated_data, outfile)
    except Exception as e:
        print("skipping diff as no contribution: ", e)
Ejemplo n.º 3
0
def calcDiff(user_id):
    """
      It calculates the longevity of each contribution of a user across the
      following revisions listed in the row's ``next_rev`` entry.

      For every row read from ``user_data/rev_list_<user_id>-o.json`` that
      has a parent revision, the text added relative to the parent is
      extracted and each following revision is compared against it. The
      first revision whose preserved ratio drops below 0.95 ends the
      measurement. The annotated rows are written to
      ``user_data_50_90_b_1/rev_list_<user_id>-dp.json``.

      Args:
          user_id (str): user id of the user.
      Result:
          None. Rows are annotated in place with ``contribLength``,
          ``originaltext``, ``small_text`` and, when measurable,
          ``longevityRev`` (count of successfully compared revisions before
          the drop), ``longevityTime`` (hours) and ``matchRatio``. Any error
          outside the per-revision comparison is printed and aborts the run
          without writing the output file.
       """
    try:
        with open('user_data/rev_list_' + user_id + '-o.json', 'r') as infile:
            updated_data = json.loads(infile.read())

        for row in updated_data:
            print("Picking For Analysis Artcile,Parent,Revision: ", [row['pageid'], row['parentid'], row['revid']])
            current_rev = util.read_file('rev_user/' + str(row['revid']))

            if row['parentid'] == 0:
                # No parent revision to diff against: nothing is measured.
                continue

            parent_rev = util.read_file('rev_user/' + str(row['parentid']))
            # Keep the second element of every diff entry, dropping
            # fragments of length 0 or 1.
            diff_entries = util.findDiffRevised(parent_rev, current_rev)
            original_text = [v[1] for v in diff_entries if len(v[1]) > 1]
            small_text = [w for w in original_text if len(w) < 5]

            total = sum(len(txt) for txt in original_text)

            row['contribLength'] = total
            row['originaltext'] = original_text
            row['small_text'] = small_text

            next_revs = list(row['next_rev'])
            if total <= 0:
                continue

            start_time = dateparser.parse(row['timestamp'])
            print("Performing Diff For Artcile,Parent,Revision: ",
                  [row['pageid'], row['parentid'], row['revid'], total])
            index = 0
            capture_longevity = True
            # Reset per row so a value left over from an earlier row can
            # never be recorded for this one.
            ratio = None
            for rev in next_revs:
                try:
                    next_rev = util.read_file('rev_user/' + str(rev['revid']))
                    d_text = util.getInsertedContentSinceParentRevision(parent_rev, next_rev)
                    ratio = util.textPreservedRatioStrict(original_text, d_text)
                    print("ratio: ", ratio)
                    if ratio < 0.95:
                        end_time = dateparser.parse(rev['timestamp'])
                        row['longevityTime'] = round((end_time - start_time).total_seconds() / 3600, 2)
                        row['longevityRev'] = index
                        row['matchRatio'] = ratio
                        capture_longevity = False
                        print("longevity-S: ", index)
                        break
                except Exception as e:
                    # Best effort: a revision that cannot be read or compared
                    # is reported and not counted. Print the exception itself;
                    # ``e.message`` does not exist on Python 3 exceptions and
                    # would make this handler raise.
                    print("file error", e)
                else:
                    # Only revisions compared successfully (and not ending
                    # the measurement) advance the counter.
                    index += 1
            if capture_longevity:
                if ratio is None:
                    # Either there were no following revisions or none could
                    # be compared, so there is no end time or ratio to report.
                    print("skipping longevity, no comparable revision after: ", row['revid'])
                    continue
                row['longevityRev'] = index
                # The contribution outlived every listed revision: measure up
                # to the last one in the list.
                end_time = dateparser.parse(next_revs[-1]['timestamp'])
                row['longevityTime'] = round((end_time - start_time).total_seconds() / 3600, 2)
                row['matchRatio'] = ratio
                print("longevity-L: ", index)
        if updated_data:
            with open('user_data_50_90_b_1/rev_list_' + user_id + '-dp.json', 'w') as outfile:
                json.dump(updated_data, outfile)
    except Exception as e:
        print("skipping diff as no contribution: ", e)