Ejemplo n.º 1
0
def calcFeatures(params):
    """Compute text metrics for one revision and store them in ``rev_xl``.

    Args:
        params: An ``(index, rev)`` pair packed into a single argument (the
            original note on the unpacking says this is for multiprocessing).
            ``index`` is the positional row in the module-level ``rev_xl``
            table; ``rev`` is a mapping providing ``'revid'`` and ``'pageid'``.

    Returns:
        None. When the revision file does not exist nothing happens.
        Otherwise the metrics are written into columns 14..35 of row
        ``index`` of ``rev_xl``, and the whole table is saved to the
        module-level ``path`` whenever ``index`` is a multiple of 10.

    Raises:
        AssertionError: if ``rev['pageid']`` differs from the first column of
            row ``index`` in ``rev_xl``.
    """
    global rev_xl
    row_idx, revision = params
    rev_id = revision['revid']
    filename = "insert data path of the 2015 data from https://figshare.com/articles/English_Wikipedia_Quality_Asssessment_Dataset/1375406" + str(
        rev_id)

    # Guard clause: revisions without a file on disk are silently skipped.
    if not os.path.exists(filename):
        return

    print(rev_id)
    raw_text = util.read_file(filename)
    # Clean the markup via the project helper, then drop leftover ''' markers.
    text = util.cleanhtml(raw_text).replace('\'\'\'', '')

    # The row addressed by position must belong to the same page.
    assert revision['pageid'] == rev_xl.iloc[row_idx, 0]
    print("matched ", rev_id)

    # Metric vector: ReadCalc metrics, then Linsear Write, then the number of
    # entries returned by the grammar checker.
    metrics = list(readcalc.ReadCalc(text).get_all_metrics())
    metrics.append(round(textstatistics().linsear_write_formula(text), 2))
    metrics.append(len(tool.check(text)))

    rev_xl.iloc[row_idx, 14:36] = metrics
    print(rev_xl.iloc[row_idx, :])

    # Periodic checkpoint of the whole table.
    if row_idx % 10 == 0:
        rev_xl.to_csv(path)
Ejemplo n.º 2
0
    def test_removeWikiSyntax(self):
        """Run ``util.cleanhtml`` over a wiki-markup article and compare results.

        The fixture mixes bold markers, ``[[...]]`` links (plain and piped),
        ``==...==`` headings, ``*`` list items and a category link. The
        expected value is currently the empty string.
        """
        # Adjacent literals are concatenated with no separator, so the fixture
        # is a single line of text without newlines between its sections.
        wiki_markup = (
            "'''3rd/4th Cavalry Regiment''' is an armoured [[regiment]] of the [[Australian Army]], and is third in seniority in the [[Royal Australian Armoured Corps]]. The regiment was formed in [[1981]] through the amalgamation of the '''3rd Cavalry Regiment''' and the '''4th Cavalry Regiment'''."
            '==3rd Cavalry=='
            "The 3rd Cavalry Regiment was formed in [[1966]] as '''1st Armoured Personnel Carrier Squadron''' by the redesignation of a troop from the [[4th/19th Prince of Wales's Light Horse]], before being designated the 3rd Cavalry the same year. This unit saw extensive service throughout Australia's presence in [[Vietnam War|Vietnam]] as part of the [[Australian Task Force]]. At the time, the regiment consisted of a single squadron. However, part of A Squadron was redesignated first as 1 APC Squadron, then as B Squadron in [[1967]]-[[1968|68]]. The two squadrons swapped titles at various times until [[1971]], when A Squadron was transferred to the [[2nd Cavalry Regiment]]."
            '==4th Cavalry=='
            'The 4th Cavalry Regiment was formed in [[1971]] by the redesignation of B Squadron, [[2nd Cavalry Regiment|2nd Cavalry]] as A Squadron. This was joined in [[1975]] by a newly raised Regimental Headquarters and B Squadron. The service of the regiment was short lived, as in 1981 it was decided to amalgamate it with the 3rd Cavalry Regiment to form the 3rd/4th Cavalry Regiment.'
            '==Current Role=='
            'The 3rd/4th Cavalry Regiment currently consists of a single squadron, B Squadron. It serves as an [[armoured personnel carrier]] squadron, equipped with the [[M113]] vehicle, in the light armoured role.'
            "*'''Battle Honours'''"
            "**(inherited battle honours of 3rd Cavalry Regiment)"
            "**Long Tan, Bien Hoa, Coral-Balmoral, Hat Dich, Binh Ba, Vietnam 1965-72"
            "[[Category:Australian regiments]]"
        )

        expected = ""
        cleaned = util.cleanhtml(wiki_markup)

        self.assertEqual(expected, cleaned)
Ejemplo n.º 3
0
def calcDiff_Enhanced(user_id, should_clean=False):
    """Measure how long each of a user's contributions survives later edits.

    Loads the revision list from ``user_data_all/rev_list_<user_id>.json``.
    For every row that has a parent revision (``parentid != 0``) the text
    contributed by the revision is derived from the diff between parent and
    current revision, split into sentences and passed through stop-word
    removal. The revisions listed in ``row['next_rev']`` are then walked in
    order, and the first point at which the preserved ratio drops below 0.90
    is recorded as a percentage of "user turns" (changes of editor, ignoring
    the contributing user) elapsed so far.

    Each processed row gains ``contribLength`` and ``originaltext``; rows
    with a non-empty contribution may also gain ``longevityRev``,
    ``matchRatio`` and ``totalContrib``. The updated list is written to
    ``user_data_all_b/rev_list_<user_id>-dp.json`` when it is non-empty.

    Args:
        user_id (str): user id used to build the input and output file names.
        should_clean (bool): when true, revision texts are passed through
            ``util.cleanhtml`` before being compared.

    Returns:
        None. Results are written to disk. Any exception escaping the
        per-revision handling is caught, reported with ``print`` and ends the
        function before the output file is written.
    """
    try:
        with open('user_data_all/rev_list_' + user_id + '.json',
                  'r') as infile:
            updated_data = json.loads(infile.read())

        for row in updated_data:
            print("Picking For Analysis Artcile,Parent,Revision: ",
                  [row['pageid'], row['parentid'], row['revid']])
            # True until a longevity value has been recorded via the
            # "ratio < 0.90" branch below.
            capture_longevity = True
            current_rev = util.read_file('rev_user/' + str(row['revid']))
            if should_clean:
                current_rev = util.cleanhtml(current_rev).strip()

            if row['parentid'] == 0:
                # No parent revision: the whole text is kept as the
                # contribution, but nothing further is computed for this row
                # because all of the analysis lives in the else branch.
                original_text = current_rev
            else:
                parent_rev = util.read_file('rev_user/' + str(row['parentid']))
                if should_clean:
                    parent_rev = util.cleanhtml(parent_rev).strip()
                original_text = util.findDiffRevised(parent_rev, current_rev)
                # Keep element 1 of every diff entry (presumably the changed
                # text; confirm against util.findDiffRevised).
                original_text = list(v[1] for v in original_text)
                # Drop fragments of length 0 or 1.
                original_text = [w for w in original_text if len(w) > 1]

                original_text_clean = []

                # Sentence-split every fragment and strip stop words from
                # each sentence.
                for contributions in original_text:
                    sent_toks_list = util.sent_tokenize(contributions)
                    for sent_tok in sent_toks_list:
                        original_text_clean.append(
                            util.stop_word_removal(sent_tok))

                original_text = original_text_clean

                # Contribution size: summed length of the cleaned items.
                total = 0
                for txt in original_text:
                    total += len(txt)

                row['contribLength'] = total
                row['originaltext'] = original_text

                next_revs = [i for i in row['next_rev']]
                if total > 0:
                    print(original_text)
                    print(
                        "Performing Diff For Artcile,Parent,Revision: ",
                        [row['pageid'], row['parentid'], row['revid'], total])
                    # index: user turns seen so far in the second loop.
                    # hasZero: set while the contribution is seen as fully
                    # absent (ratio == 0) and cleared once ratio >= 0.9 again.
                    index = 0
                    hasZero = False
                    lastUserID = 0
                    total_no_rev = 0

                    # finding total number of user turns
                    # A turn is counted whenever the editor differs from the
                    # previously counted editor and is not the contributor.
                    for rev in next_revs:
                        if rev['userid'] != lastUserID and rev[
                                'userid'] != row['userid']:
                            total_no_rev += 1
                            lastUserID = rev['userid']

                    print("Total Turns/ Contribution: ",
                          [total_no_rev, len(next_revs)])

                    lastUserID = 0

                    # finding achieved number of user turns
                    for rev in next_revs:
                        try:
                            next_rev = util.read_file('rev_user/' +
                                                      str(rev['revid']))
                            # NOTE(review): unlike current_rev and parent_rev,
                            # the cleaned text is not stripped here; confirm
                            # whether that is intended.
                            if should_clean:
                                next_rev = util.cleanhtml(next_rev)
                            d_text = util.getInsertedContentSinceParentRevision(
                                parent_rev, next_rev).strip()
                            ratio = util.textPreservedRatioBigramEnhanced(
                                original_text, d_text)

                            if rev['userid'] != lastUserID and rev[
                                    'userid'] != row['userid']:
                                index += 1
                                lastUserID = rev['userid']

                            print("ratio: ", ratio)
                            if ratio == 0 and not hasZero:
                                # hasZero is set before the division below, so
                                # it stays set even if that division raises.
                                hasZero = True
                                row['longevityRev'] = round(
                                    index / total_no_rev, 2) * 100
                                row['matchRatio'] = ratio
                                row['totalContrib'] = total
                                print("in zero mode")
                            elif ratio >= 0.9 and hasZero:
                                hasZero = False
                                print("out zero mode")
                            if ratio < 0.90 and capture_longevity and not hasZero:
                                # Preserved ratio dropped below the threshold:
                                # record the longevity and stop scanning.
                                row['longevityRev'] = round(
                                    index / total_no_rev, 2) * 100
                                row['matchRatio'] = ratio
                                row['totalContrib'] = total
                                capture_longevity = False
                                print("longevity-S: ", index)
                                break
                        except Exception as e:
                            # Catches every exception from this iteration,
                            # including ZeroDivisionError when total_no_rev
                            # is 0, not only file-reading problems.
                            print("file error", e)

                    # The loop ended without the ratio dropping below the
                    # threshold: record the value reached at the end.
                    # NOTE(review): if next_revs is empty or every iteration
                    # failed before assigning it, `ratio` is unbound here, and
                    # if total_no_rev is 0 the division raises. Either error
                    # reaches the outer handler, which ends the function
                    # without writing the output file.
                    if capture_longevity and not hasZero:
                        row['longevityRev'] = round(index / total_no_rev,
                                                    2) * 100
                        row['matchRatio'] = ratio
                        row['totalContrib'] = total
                        print("longevity-L: ", index)
        if len(updated_data) > 0:
            with open('user_data_all_b/rev_list_' + user_id + '-dp.json',
                      'w') as outfile:
                json.dump(updated_data, outfile)
    except Exception as e:
        # Broad catch: any failure above (missing input file, missing keys,
        # arithmetic errors) is reported with this single message.
        print("skipping diff as no contribution: ", e)
Ejemplo n.º 4
0
    def test_bigramMethodEnhancedClean(self):
        """Check ``util.textPreservedRatioBigramEnhanced`` on fixed inputs.

        Six cases are covered. The first uses raw fragments and raw later
        text; the others pass both sides through ``util.cleanhtml`` first.
        Cases 3 to 6 share the same later text.
        """
        # Case 1: uncleaned fragments against an uncleaned later revision.
        fragments = ['Old Hittite has th', 'of', "where Cuneiform Luwian instead uses the ''-ssa'' adjectival suffix. ", ', on the northern border of both, like later', 'eroglyphic Luwian has', "an ''-as'' genitive and an ''-asa'' adjectival suffix. Palaic also shows the same gender distinction as seen in Hittite, i.e. animate vs. inanimate; and has similar pronoun forms. Therefore Palaic is thought to belong to the Anatolian languages, although whether as a sister language to Old Hittite or Cuneiform Luwian is un", 'nown']
        later_text = (
            "\n"
            ", the capital of the HittitesHittite language|(Anatolia)|Pala  only1Emmanuel   , the leading God of the land of PalIn particular, ain Hittite foriparwa and associated deities, includes passages stating, \"The Old Woman speaks the words of the bread in Palaic,\" or alternately \"the words of the meal,\" though no Palaic passages are quoted. The Palaic-language texts are all from a religious context, with ritual and mythological content.arruba, O. ''Das Palaische. exte, Grammatik, Lexikon.'' Wiesbaden: arrassowitz, 190. StBoT 10 Old Hittite has thofwhere Cuneiform script|Cuneiform Luwian instead uses the ''-ssa'' adjectival suffix. , on the northern border of both, like later Anatolian hieroglyph|eroglyphic Luwian hasan ''-as'' genitive and an ''-asa'' adjectival suffix. Palaic also shows the same gender distinction as seen in Hittite, i.e. animate vs. inanimate; and has similar pronoun forms. Therefore Palaic is thought to belong to the Anatolian languages, although whether as a sister language to Old Hittite or Cuneiform Luwian is unnown"
            "\n"
            "\n"
            "\n"
            "\n"
            "\n"
        )
        preserved = util.textPreservedRatioBigramEnhanced(fragments, later_text)
        self.assertEqual(preserved, 0.74)

        # Case 2: fragments containing HTML tags, both sides cleaned.
        fragments = ['(2001) ', ' is that terrapin nests and were found last year,', ', Hampshire)', ' very', '<em>', '</em>']
        fragments = [util.cleanhtml(piece) for piece in fragments]
        later_text = util.cleanhtml("The '''''', ''Trachemys scripta elegans'' is native to the southern [[nited States]], and has become common in the UKIt is a medium-ration to a tortoise, ranging in sizeKeared terrapins are not native,,(2001)  is that terrapin nests and eggs were found last year,t[[snapping turtles]]6, Hampshire) very'''', tterrapins are members of the group [[]], referring to reptiles with a shell, which contains")
        preserved = util.textPreservedRatioBigramEnhanced(fragments, later_text)
        self.assertEqual(preserved, 1.0)

        # Case 3: plain-English fragments; this later text is reused below.
        fragments = ['Cant See this weather. I am fine', 'intersteller was nice movie']
        later_text = util.cleanhtml(
            'I am fine, how about you? this weather is really hot and humid. '
            'One needs to hydrate regularly to survive.')
        fragments = [util.cleanhtml(piece) for piece in fragments]
        preserved = util.textPreservedRatioBigramEnhanced(fragments, later_text)
        self.assertEqual(preserved, 0.12)

        # Case 4
        fragments = ['One needs to hydrate regularly to survive.', 'this weather really']
        fragments = [util.cleanhtml(piece) for piece in fragments]
        preserved = util.textPreservedRatioBigramEnhanced(fragments, later_text)
        self.assertEqual(preserved, 1.0)

        # Case 5
        fragments = ['One needs', 'two three']
        fragments = [util.cleanhtml(piece) for piece in fragments]
        preserved = util.textPreservedRatioBigramEnhanced(fragments, later_text)
        self.assertEqual(preserved, 0.5)

        # Case 6
        fragments = ['One needs to regularly to survive.']
        fragments = [util.cleanhtml(piece) for piece in fragments]
        preserved = util.textPreservedRatioBigramEnhanced(fragments, later_text)
        self.assertEqual(preserved, 1.0)