Beispiel #1
0
    def prepare_testing_datasets_wiki(self, file_list_wiki, rd_folder_path):
        """Build an (actual, error) test set from wiki revision JSON files.

        Args:
            file_list_wiki: file names to scan; only ``*.json`` entries are read.
            rd_folder_path: directory containing the revision JSON files.

        Returns:
            pandas.DataFrame with columns ['actual', 'error'], where
            'actual' holds the corrected (new) value and 'error' the old
            (dirty) value. Rows appear newest-first with a 0..n-1 index,
            matching the layout the original loc[-1]/index-shift/sort_index
            idiom produced.
        """
        total_data = 0
        rows = []  # collected [actual, error] pairs, in discovery order
        for rf in file_list_wiki:
            if not rf.endswith(".json"):
                continue
            try:
                # Close the file deterministically instead of leaking the
                # handle (the original passed io.open() straight to json.load).
                with io.open(os.path.join(rd_folder_path, rf),
                             encoding="utf-8") as fh:
                    revision_list = json.load(fh)
                for one_item in revision_list:
                    old_value = remove_markup(str(one_item[0]['old_value'].strip()))
                    old_value = re.sub('[^a-zA-Z0-9.-]+', ' ', old_value).strip()

                    new_value = remove_markup(str(one_item[0]['new_value'].strip()))
                    new_value = re.sub('[^a-zA-Z0-9.-]+', ' ', new_value).strip()
                    # Keep only pairs where both sides are longer than 3 chars
                    # and neither is the literal placeholder 'none'/'None'.
                    # (len > 3 already rules out empty and blank strings, so
                    # the original truthiness and != " " checks are subsumed.)
                    if (len(old_value) > 3 and len(new_value) > 3
                            and old_value not in ("none", "None")
                            and new_value not in ("none", "None")):
                        rows.append([new_value, old_value])
                        total_data = total_data + 1
            except Exception as e:
                print('Exception from wiki: ', str(e))
        print("total_data: ", total_data)
        # Batch construction is O(n) instead of the original O(n^2)
        # per-row loc[-1]/sort_index; reversing keeps newest row first.
        return pd.DataFrame(list(reversed(rows)),
                            columns=['actual', 'error'])
Beispiel #2
0
 def prepare_domain_testing_datasets_wiki(self, file_list_wiki,
                                          rd_folder_path, domain_type):
     """Build an (actual, error) test set restricted to one column domain.

     Args:
         file_list_wiki: file names to scan; only ``*.json`` entries are read.
         rd_folder_path: directory containing the revision JSON files.
         domain_type: currently only "location" selects a column whitelist;
             any other value yields an empty result.

     Returns:
         pandas.DataFrame with columns ['actual', 'error'] (newest row
         first, 0..n-1 index), holding cleaned new/old value pairs for
         revisions whose errored column belongs to the selected domain.
     """
     total_data = 0
     # Bug fix: default to an empty whitelist so an unrecognized
     # domain_type no longer raises NameError at the first use below
     # (previously swallowed per-file by the broad except, making every
     # file look like a parse failure).
     domain_location = []
     if domain_type == "location":
         domain_location = [
             'Country', 'COUNTRY', 'country', 'CITY', 'City', 'city',
             'Location', 'LOCATION', 'location', 'Place', 'PLACE', 'place',
             'VENUE', 'venue', 'Venue', 'Town', 'town', 'TOWN',
             'birth_place', 'death_place'
         ]
     rows = []  # collected [actual, error] pairs, in discovery order
     for rf in file_list_wiki:
         if not rf.endswith(".json"):
             continue
         try:
             # Close the file deterministically instead of leaking it.
             with io.open(os.path.join(rd_folder_path, rf),
                          encoding="utf-8") as fh:
                 revision_list = json.load(fh)
             for one_item in revision_list:
                 if not domain_location:
                     continue
                 if one_item[0]['errored_column'] not in domain_location:
                     continue
                 old_value = remove_markup(
                     str(one_item[0]['old_value'].strip()))
                 old_value = re.sub('[^a-zA-Z0-9.-]+', ' ',
                                    old_value).strip()
                 new_value = remove_markup(
                     str(one_item[0]['new_value'].strip()))
                 new_value = re.sub('[^a-zA-Z0-9.-]+', ' ',
                                    new_value).strip()
                 # len > 3 already excludes empty/blank values; also drop
                 # the literal placeholders 'none'/'None'.
                 if (len(old_value) > 3 and len(new_value) > 3
                         and old_value not in ("none", "None")
                         and new_value not in ("none", "None")):
                     rows.append([new_value, old_value])
                     total_data = total_data + 1
         except Exception as e:
             print('Exception from wiki: ', str(e))
     print("Total data to repair: ", total_data)
     # Newest row first with a 0..n-1 index -- same layout the original
     # incremental loc[-1]/index-shift/sort_index construction produced.
     return pd.DataFrame(list(reversed(rows)), columns=['actual', 'error'])
Beispiel #3
0
 def prepare_datasets_retrain_wiki(
         self, file_list_wiki, rd_folder_path):  ###only for edit distance
     """Collect cleaned, digit-free table cells for retraining.

     Reads the dirty table of the LAST revision item in each JSON file,
     skips the header row, drops empty cells and cells containing any
     digit, normalizes the rest to [a-zA-Z0-9.-] plus spaces, and returns
     everything as one flat list of cell strings.

     Args:
         file_list_wiki: file names to scan; only ``*.json`` are read.
         rd_folder_path: directory containing the revision JSON files.

     Returns:
         list[str]: the cleaned cell values.
     """
     train_data_rows = []
     for rf in file_list_wiki:
         if not rf.endswith(".json"):
             continue
         try:
             # Close the file deterministically instead of leaking the
             # handle (the original passed io.open() straight to json.load).
             with io.open(os.path.join(rd_folder_path, rf),
                          encoding="utf-8") as fh:
                 revision_list = json.load(fh)
             one_item = revision_list[-1]
             dirty_table = one_item[0]['dirty_table']
             for index, row in enumerate(dirty_table):
                 if index == 0:
                     continue  # header row
                 # Rows are stored as the repr of a list; strip markup,
                 # then literal_eval back into a list of cells.
                 row = ast.literal_eval(remove_markup(str(row)))
                 row = list(filter(None, row))
                 # Drop cells containing any digit -- numeric data is not
                 # useful for the edit-distance retraining corpus.
                 row = [
                     x for x in row if not any(x1.isdigit() for x1 in x)
                 ]
                 if row:
                     row = [
                         re.sub('[^a-zA-Z0-9.-]+', ' ', _) for _ in row
                     ]
                     train_data_rows.extend(row)
         except Exception as e:
             print('Exception: ', str(e))
     return train_data_rows
 def infobox_parsing(self):
     """
     This method will extract all infobox templates with revision.

     For every template whose name contains "Infobox" in self.code,
     creates (if needed) a per-page / per-infobox folder under
     self.rdd_folder_path and dumps the raw template text into
     <parent_revision_id>_<current_revision_id>.json inside it.

     Returns:
         int: number of infobox templates written.
     """
     infobox_count = 0
     templates = self.code.filter_templates()
     for temp in templates:
         json_list = []
         if "Infobox" in temp.name:
             self.revision_page_folder_path = os.path.join(
                 self.rdd_folder_path, self.page_folder)
             if not os.path.exists(self.revision_page_folder_path):
                 os.mkdir(self.revision_page_folder_path)
             # Normalize the infobox name into a lower-case,
             # filesystem-safe folder name.
             infobox_folder = remove_markup(str(temp.name))
             infobox_folder = re.sub('[^a-zA-Z0-9\n\.]', ' ',
                                     (str(infobox_folder)).lower())
             revision_infobox_folder_path = os.path.join(
                 self.revision_page_folder_path, infobox_folder)
             if not os.path.exists(revision_infobox_folder_path):
                 os.mkdir(revision_infobox_folder_path)
             json_list.append(str(temp))
             out_path = os.path.join(
                 revision_infobox_folder_path,
                 self.revision_id_parent + '_' +
                 self.revision_id_current + ".json")
             # Bug fix: close the output file deterministically -- the
             # original passed a bare open() handle to json.dump and
             # leaked it.
             with open(out_path, "w") as out_file:
                 json.dump(json_list, out_file)
             print(temp.name)
             infobox_count = infobox_count + 1
     return infobox_count
Beispiel #5
0
 def identity(self) -> typing.Optional[str]:  # type: ignore[return]
     """Return the plain text of the "Identity" section, or None.

     Scans heading levels 1-6; when the first section found at a level
     is titled "identity" (case-insensitive), returns its contents with
     wiki markup stripped.
     """
     parsed = parse(self.main_content)
     for level in range(1, 7):
         sections = parsed.get_sections(include_subsections=False,
                                        level=level)
         if sections and sections[0].title.lower() == "identity":
             return remove_markup(sections[0].contents)
Beispiel #6
0
    def background(self) -> typing.Optional[str]:  # type: ignore[return]
        """Return the cleaned "Background" section text, or None.

        Walks heading levels 1-6; when the first section at a level is
        titled "background" (case-insensitive), hands its contents to the
        helper's clean_bg() and strips any remaining wiki markup.
        """
        parsed = parse(self.main_content)
        for level in range(1, 7):
            found = parsed.get_sections(include_subsections=False,
                                        level=level)
            if found and found[0].title.lower() == "background":
                self.helper.content = found[0].contents
                return remove_markup(self.helper.clean_bg())
    def table_parsing(self):
        """
        This method will extract all table templates with revision.

        For each captioned table in self.table, creates (if needed) a
        per-page / per-caption folder under self.rdd_folder_path and dumps
        the raw table text into
        <parent_revision_id>_<current_revision_id>.json.

        Returns:
            int: number of tables exported. Bug fix: the original
            incremented the counter both before and after the dump, so
            every table was counted twice.
        """
        table_count = 0
        if self.table:
            for tebil in self.table:
                json_list = []
                try:
                    # Normalize the caption into a lower-case folder name.
                    table_caption = wtp.parse(str(tebil)).tables[0].caption
                    table_folder_name = remove_markup(str(table_caption))
                    table_folder_name = table_folder_name.lower().strip()
                except Exception as e:
                    print('Exception: table folder name ', str(e))
                    continue

                if table_caption:
                    try:
                        self.revision_page_folder_path = os.path.join(
                            self.rdd_folder_path, self.page_folder)
                        if not os.path.exists(self.revision_page_folder_path):
                            os.mkdir(self.revision_page_folder_path)
                        table_folder_name = table_folder_name.strip('\n')
                        revision_table_folder_path = os.path.join(
                            self.revision_page_folder_path,
                            table_folder_name).strip()
                        if not os.path.exists(revision_table_folder_path):
                            os.mkdir(revision_table_folder_path)
                    except Exception as e:
                        print('Exception: revision table folder', str(e))
                        continue
                    json_list.append(str(tebil))
                    out_path = os.path.join(
                        revision_table_folder_path,
                        self.revision_id_parent + '_' +
                        self.revision_id_current + ".json")
                    # Close the output file deterministically -- the
                    # original leaked the handle passed to json.dump.
                    with open(out_path, "w") as out_file:
                        json.dump(json_list, out_file)
                    print('Table caption: ', table_folder_name)
                    # Count each exported table exactly once.
                    table_count = table_count + 1

        return table_count
Beispiel #8
0
def search(text):
  """Strip an Estonian Wikipedia article down to plain body text.

  Cuts the article off at the first trailing section ("Vaata ka" = See
  also, "Kirjandus" = Literature, "Viited" = References, "Välislingid" =
  External links), then removes tags, template arguments, comments,
  external links, headings, templates and image wikilinks before running
  remove_markup over what is left.
  """
  # NOTE(review): unlike the three patterns below, this one has no
  # trailing '==' -- confirm whether that is intentional.
  regex = re.compile(r'==\s*Vaata ka\s*')
  text = regex.split(text)[0]
  regex = re.compile(r'==\s*Kirjandus\s*==')
  text = regex.split(text)[0]
  regex = re.compile(r'==\s*Viited\s*==')
  text = regex.split(text)[0]
  regex = re.compile(r'==\s*Välislingid\s*==')
  text = regex.split(text)[0]
  code = mw.parse(trim_unnessessary_spaces(text))

  # Drop brace-wrapped / gallery / imagemap / center / ref tags entirely;
  # unwrap every other tag to its contents.
  for tag in code.filter_tags(recursive=False):
    if tag[0] == "{" and tag[-1] == "}" or tag.tag == 'gallery' or tag.tag == 'imagemap' or tag.tag == 'center' or tag.tag == 'ref': #or tag.tag == 'table':
      code.replace(tag,"")
    else:
      code.replace(tag,tag.contents)

  for argument in code.filter_arguments():
    code.replace(argument, "")

  for comment in code.filter_comments():
    code.replace(comment,"")

  for external_link in code.filter_external_links():
    code.replace(external_link,"")

  for heading in code.filter_headings():
    code.replace(heading,"")

  # Decode HTML entities (&amp; etc.) into their literal characters.
  for html_entity in code.filter_html_entities():
    code.replace(html_entity, html_entity.normalize())

  for template in code.filter_templates(recursive=False):
    code.replace(template,"")

  # Remove image wikilinks (File/Fail/Pilt/Image: ... with an image
  # extension); other wikilinks are left for remove_markup below.
  for wikilink in code.filter_wikilinks(recursive=False):
    if bool(re.match("(File|Fail|Pilt|Image):.+\.(SVG|svg|JPEG|jpeg|GIF|gif|PNG|png|JPG|jpg)",str(wikilink.title))):
      code.replace(wikilink,"")

  answer = remove_markup(str(code))

  # NOTE(review): split('\n') already removes the newlines, so the
  # re.sub below is a no-op; the net effect of this stanza is simply
  # answer.replace('\n', '').
  splitted = answer.split('\n')
  for i,item in enumerate(splitted):
    splitted[i]=re.sub("\n", "",splitted[i])
  answer = ''.join(splitted)
  return answer
Beispiel #9
0
    def prepare_wiki_datasets_finetune(self, file_list_wiki, rd_folder_path,
                                       domain_type):
        """Write a fine-tuning text corpus from domain-matching wiki tables.

        For each revision JSON whose last item's errored column belongs to
        the selected domain, cleans the non-header rows of the dirty table
        (dropping empty and digit-bearing cells) and appends each row,
        space-joined and terminated by " .", to ``train_bert_wiki.txt``.

        Args:
            file_list_wiki: file names to scan; only ``*.json`` are read.
            rd_folder_path: directory containing the revision JSON files.
            domain_type: currently only "location" selects a whitelist of
                column names; any other value produces an empty corpus.
        """
        train_data_rows = []
        # Bug fix: default to an empty whitelist so an unrecognized
        # domain_type no longer raises NameError inside the try below
        # (which the broad except used to report as a per-file exception).
        domain_location = []
        if domain_type == "location":
            domain_location = [
                'Country', 'COUNTRY', 'country', 'CITY', 'City', 'city',
                'Location', 'LOCATION', 'location', 'Place', 'PLACE', 'place',
                'VENUE', 'venue', 'Venue', 'Town', 'town', 'TOWN',
                'birth_place', 'death_place'
            ]
        for rf in file_list_wiki:
            if not rf.endswith(".json"):
                continue
            try:
                # Close the input file deterministically instead of leaking
                # the handle.
                with io.open(os.path.join(rd_folder_path, rf),
                             encoding="utf-8") as fh:
                    revision_list = json.load(fh)
                one_item = revision_list[-1]
                if not domain_location:
                    continue
                if one_item[0]['errored_column'] not in domain_location:
                    continue
                dirty_table = one_item[0]['dirty_table']
                for index, row in enumerate(dirty_table):
                    if index == 0:
                        continue  # header row
                    # Rows are stored as the repr of a list; strip markup,
                    # then literal_eval back into a list of cells.
                    row = ast.literal_eval(remove_markup(str(row)))
                    row = list(filter(None, row))
                    # Drop cells containing any digit.
                    row = [
                        x for x in row
                        if not any(x1.isdigit() for x1 in x)
                    ]
                    if row:
                        train_data_rows.append([
                            re.sub('[^a-zA-Z0-9.-]+', ' ', cell)
                            for cell in row
                        ])
            except Exception as e:
                print('Exception from wiki: ', str(e))
        # One "sentence" per table row: cells space-joined, '.'-terminated.
        txt = "".join(' '.join(r) + " ." for r in train_data_rows)
        with open("train_bert_wiki.txt", "w") as output:
            output.write(txt)
Beispiel #10
0
 def masking_error_value(self, filelist, rd_folder_path):
     """For every revision JSON, replace each occurrence of the erroneous
     old value inside the row's vicinity with the literal token "[MASK]",
     printing the vicinity before and after masking."""
     error_value = []
     vicinity_with_amsk = []
     mask = "[MASK]"
     for rf in filelist:
         if not rf.endswith(".json"):
             continue
         try:
             revision_list = json.load(
                 io.open(os.path.join(rd_folder_path, rf),
                         encoding="utf-8"))
             for one_item in revision_list:
                 # The vicinity is stored as the repr of a list; strip
                 # markup first, then literal_eval it back into a list and
                 # drop empty cells.
                 raw_vicinity = remove_markup(str(one_item[0]['vicinity']))
                 cells = [c for c in ast.literal_eval(raw_vicinity) if c]
                 error_value = self.preprocess_text(one_item[0]['old_value'])
                 cells = [self.preprocess_text(c) for c in cells]
                 print('Before Masking: ', cells)
                 print('Error value : ', error_value)
                 error_value = error_value.strip()
                 cells = [
                     mask if str(c).strip() == str(error_value) else c
                     for c in cells
                 ]
                 print('After masking : ', cells)
         except Exception as e:
             print('Exception: ', str(e))
Beispiel #11
0
# Extract and print the plain-text "Plot"/"Synopsis" section of every
# page in a Wikipedia XML export.
import untangle
import wikitextparser as wtp
from wikitextparser import remove_markup, parse

# Path to the MediaWiki XML export to process.
wiki_file = "./Wikipedia.xml"

wiki_obj = untangle.parse(wiki_file)

# Every <page> element of the dump.
stories = wiki_obj.mediawiki.page

#wiki_page = wtp.parse(stories[1].revision.text.cdata)

for story in stories:
    story_text = wtp.parse(story.revision.text.cdata)
    plain_text = ""
    for section in story_text.sections:
        title = str(section.title).strip()
        #print("+"+title+"+")
        if title == "Plot" or title == "Synopsis":
            try:
                # remove_markup may raise on malformed markup; fall back
                # to the empty string for that page in that case.
                plain_text = remove_markup(section.string)

            except:
                pass
    print(plain_text)
    def diff_check_revision(self):
        """Diff the first table of two revisions cell-by-cell.

        Parses the current and previous revision texts, extracts the first
        <table> from each, and -- when both were extracted, the previous
        header has no duplicate columns, and both share a column count --
        compares corresponding cells with difflib. Each differing cell pair
        becomes a dict recording the table's columns, the errored column's
        value domain, the row vicinity, the column name, and the old/new
        cell values (both limited to < 50 characters).

        Returns:
            list: the collected revision dicts (empty when nothing differs
            or table extraction failed).
        """
        create_revision_list = []
        table_column_current = None
        table_column_previous = None
        code_current = mwparserfromhell.parse(self.current_revision_file[0],
                                              skip_style_tags=True)
        code_previous = mwparserfromhell.parse(self.previous_revision_file[0],
                                               skip_style_tags=True)
        try:
            ########### Current revision table  data extraction
            table1 = code_current.filter_tags(
                matches=lambda node: node.tag == "table")
            table_code_current = wtp.parse(str(table1[0])).tables[0]
            table_data_current = table_code_current.data()
            table_column_current = table_data_current[0]
            ########## previous revision table data extraction
            table2 = code_previous.filter_tags(
                matches=lambda node: node.tag == "table")
            table_code_previous = wtp.parse(str(table2[0])).tables[0]
            table_data_previous = table_code_previous.data()
            table_column_previous = table_data_previous[0]
            # First row is the header; the remaining rows become the
            # DataFrame used later to pull each column's value domain.
            df_data = DataFrame(table_data_previous)
            header = df_data.iloc[0]
            new_column_list = header.tolist()
            df_data = df_data[1:]
            df_data.columns = header
        except Exception as e:
            print('Exception from table data: ', str(e))
        # Proceed only when both headers were extracted and the previous
        # header has no duplicate column names.
        # NOTE(review): if the try above failed after the header
        # assignments, names like df_data/new_column_list may be unbound
        # here; the inner try/except below would swallow the resulting
        # NameError -- confirm that is acceptable.
        if table_column_current and table_column_previous and len(
                table_column_previous) == len(set(table_column_previous)):
            self.table_count_with_error = self.table_count_with_error + 1
            if len(table_column_current) == len(table_column_previous):
                text1 = table_data_previous
                text2 = table_data_current

                if text1 and text2:
                    for index1, (txt1,
                                 txt2) in enumerate(zip(text1,
                                                        text2)):  #row parsing
                        if index1 == 0:
                            continue
                        d = difflib.Differ()
                        for index, (cell1, cell2) in enumerate(zip(
                                txt1, txt2)):  # values of row parsing
                            create_revision_dict = {}
                            old_value = None
                            new_value = None
                            cell1 = remove_markup(str(cell1))
                            cell2 = remove_markup(str(cell2))
                            cell1 = cell1.strip()
                            cell2 = cell2.strip()
                            #print(cell1)
                            #print(cell2)
                            if cell1 and cell2:
                                diff1 = d.compare([''.join(cell1)], [cell2])
                                try:
                                    if diff1:
                                        # Differ prefixes removed lines with
                                        # '-' and added lines with '+';
                                        # unchanged lines start with ' '.
                                        for line in diff1:
                                            #print(line)
                                            #print('###############################################################################')
                                            if not line.startswith(' '):
                                                if line.startswith('-'):
                                                    old_value = line[1:]
                                                if line.startswith('+'):
                                                    new_value = line[1:]
                                        if old_value and new_value:
                                            #table_column_current1=remove_markup(str(table_column_current))
                                            txt1 = remove_markup(str(txt1))
                                            old_value = remove_markup(
                                                str(old_value))
                                            new_value = remove_markup(
                                                str(new_value))
                                            column_name = new_column_list[
                                                index]
                                            column_name = str(column_name)
                                            #print(column_name)
                                            #print(type(column_name))

                                            # All values previously seen in
                                            # the errored column (its domain).
                                            column_values = df_data[
                                                column_name].tolist()
                                            column_values = remove_markup(
                                                str(column_values))
                                            #value = html.unescape(value)
                                            #new_value = re.sub("[\t\n ]+", " ", new_value, re.UNICODE)
                                            #value = value.strip("\t\n ")
                                            # Strip residual HTML tags from
                                            # the column names.
                                            cleanr = re.compile('<.*?>')

                                            all_column = list(df_data.columns)
                                            #all_column=html.unescape(str(all_column))
                                            #all_column=remove_markup(str(all_column))
                                            all_column = re.sub(
                                                cleanr, ' ', str(all_column))
                                            all_column = remove_markup(
                                                all_column)
                                            column_name = re.sub(
                                                cleanr, ' ', str(column_name))
                                            column_name = remove_markup(
                                                column_name)
                                            # Skip very long cell values --
                                            # presumably markup noise; TODO
                                            # confirm the 50-char cutoff.
                                            if len(old_value) < 50 and len(
                                                    new_value) < 50:
                                                create_revision_dict = {
                                                    "columns": all_column,
                                                    "domain": column_values,
                                                    "vicinity": txt1,
                                                    "errored_column":
                                                    column_name,
                                                    "old_value": old_value,
                                                    "new_value": new_value
                                                }
                                                create_revision_list.append(
                                                    create_revision_dict)
                                                print('column: ', column_name,
                                                      'old cell: ', old_value,
                                                      'new_cell: ', new_value)
                                except Exception as e:
                                    print('Exception from revised value: ',
                                          str(e))
        return create_revision_list
def test_nested_bold_or_italic_plain_text():
    """remove_markup keeps only the link label when bold/italic markup is
    nested inside a piped wikilink."""
    assert remove_markup("''[[a|''b'']]") == 'b'
    assert remove_markup("'''[[a|'''b''']]") == 'b'
def test_remove_markup():
    """Italic, template, comment and bold markup are all stripped, leaving
    the intervening plain text (including its spaces) intact."""
    assert remove_markup("''a'' {{b}} c <!----> '''d'''") == "a  c  d"
    def error_correction_fasttext_with_retrain_wiki(self, model_type,
                                                    datasets_type, dataparam_1,
                                                    dataparam_2):
        """Evaluate a FastText model on wiki errors after incremental retraining.

        Loads a pretrained FastText model chosen by ``model_type``, prepares
        an (actual, error) test set, incrementally retrains the model on the
        cleaned dirty-table rows of each revision file, then attempts to
        repair each error value via a most-similar lookup and reports the
        counts through self.evaluate_model.

        Args:
            model_type: selects the pretrained model and test-set preparation
                ("Fasttext_All_Domain", "Fasttext_CV_Fold",
                "Fasttext_Domain_Location").
            datasets_type: only "wiki" triggers the retrain + repair loop.
            dataparam_1: list of revision JSON file names.
            dataparam_2: folder containing those files.
        """
        total_error = 0
        total_error_to_repaired = 0
        total_repaired = 0
        if model_type == "Fasttext_All_Domain":  #every time it will load the pretrained model to test new wiki table
            error_correction = self.prepare_testing_datasets_wiki(
                dataparam_1, dataparam_2
            )  #dataparam_1 : json_list, dataparam_2: path of json_filelist
            model_fasttext = FastText.load("model/Fasttext_All_Domain.w2v")
        # NOTE(review): for "Fasttext_CV_Fold" no error_correction frame is
        # prepared, and for an unlisted model_type model_fasttext stays
        # unbound -- both would raise NameError further down when
        # datasets_type == "wiki"; confirm the intended call patterns.
        if model_type == "Fasttext_CV_Fold":
            model_fasttext = FastText.load("model/Fasttext_CV_Fold.w2v")
        if model_type == "Fasttext_Domain_Location":
            model_fasttext = FastText.load(
                "model/Fasttext_Location_Domain.w2v")
            error_correction = self.prepare_domain_testing_datasets_wiki(
                dataparam_1, dataparam_2, "location")
            total_error = self.calculate_total_error_wiki(
                dataparam_1, dataparam_2)
        if datasets_type == "wiki":
            train_data_rows = []
            for rf in dataparam_1:
                if rf.endswith(".json"):
                    try:
                        revision_list = json.load(
                            io.open(os.path.join(dataparam_2, rf),
                                    encoding="utf-8"))
                        one_item = revision_list[-1]
                        old_value = str(one_item[0]['old_value'].strip())
                        new_value = str(one_item[0]['new_value'].strip())
                        vicinity = one_item[0]['vicinity']
                        vicinity = remove_markup(str(vicinity))
                        vicinity = ast.literal_eval(vicinity)
                        #print('Before: ',vicinity)
                        # Swap the erroneous cell for its corrected value so
                        # the retraining vicinity reflects clean data.
                        train_vicinity_index = vicinity.index(old_value)
                        del vicinity[train_vicinity_index]
                        vicinity.append(new_value)
                        vicinity = [
                            x for x in vicinity
                            if not any(x1.isdigit() for x1 in x)
                        ]
                        vicinity = [x for x in vicinity if len(x) != 0
                                    ]  #remove empty item from list
                        #vicinity=[re.sub('[^a-zA-Z0-9.-]+', ' ', _) for _ in vicinity]
                        #print('After: ', vicinity)
                        #row=list(filter(None, row))
                        dirty_table = one_item[0]['dirty_table']
                        for index, row in enumerate(dirty_table):
                            if index == 0:
                                continue
                            shape = len(row)
                            row = remove_markup(str(row))
                            row = ast.literal_eval(row)
                            row = list(filter(None, row))
                            #remove all digit
                            row = [
                                x for x in row
                                if not any(x1.isdigit() for x1 in x)
                            ]
                            row = [x for x in row if len(x) != 0
                                   ]  #remove empty item from list
                            if row:
                                row = [
                                    re.sub('[^a-zA-Z0-9.-]+', ' ', _)
                                    for _ in row
                                ]
                                train_data_rows.append(row)
                    except Exception as e:
                        print('Exception: ', str(e))
            if train_data_rows:
                # Extend the vocabulary, then fine-tune on the cleaned rows.
                model_fasttext.build_vocab(train_data_rows, update=True)
                model_fasttext.train(sentences=train_data_rows,
                                     total_examples=len(train_data_rows),
                                     epochs=5)
            for error_value, actual_value in zip(error_correction['error'],
                                                 error_correction['actual']):
                try:
                    # For the location model total_error was precomputed
                    # above; otherwise count every test pair here.
                    if model_type == "Fasttext_Domain_Location":
                        pass
                    else:
                        total_error = total_error + 1

                    # Only attempt repair on purely non-numeric values.
                    if not any(x1.isdigit() for x1 in error_value):
                        total_error_to_repaired = total_error_to_repaired + 1
                        similar_value = model_fasttext.most_similar(
                            error_value)
                        #print('Actual value: ',  actual_value,'Most similar value of : ',error_value, ' ' , similar_value)
                        first, b = similar_value[0]
                        #print('Error : ', error_value, ' Fixed: ', first, ' Actual: ', actual_value)
                        first = first.strip()
                        actual_value = actual_value.strip()
                        if first == actual_value:
                            print('Error : ', error_value, ' Fixed: ', first,
                                  ' Actual: ', actual_value)
                            total_repaired = total_repaired + 1
                except:
                    # NOTE(review): bare except silently skips any failing
                    # lookup (e.g. out-of-vocabulary error_value); consider
                    # narrowing to the specific exception.
                    continue
        print(total_error, total_error_to_repaired, total_repaired)
        model_type = model_type + ' retrain wiki '
        self.evaluate_model(model_type, total_error, total_error_to_repaired,
                            total_repaired)