Example #1
    def tokenize_string_column(self,
                               delimiter_pattern_in_literal_cells,
                               string_column_name,
                               id_column_name=None):
        """
        Tokenizes string literals by assigning one split part (e.g., a keyword) per row. The input dataframe must have
        at most two columns: one column containing the strings to be tokenized, and an optional column containing ids.

        Args:
            delimiter_pattern_in_literal_cells(str): Delimiter used to split each string into tokens.
            string_column_name(str): Name of the column that holds the strings to be tokenized.
            id_column_name(str): Name of the id column, if present.

        Returns:
            Data_Frame (updated self)

        Examples:
            >>> import pandas

            >>> #===== TOKENIZING A SINGLE-COLUMN DATAFRAME ============================================================
            >>> # Make a single-column dataframe:
            >>> df = pandas.DataFrame({'the only column': ('a; b', 'c; d; e')})
            >>> my_Data_Frame = Data_Frame(df)
            >>> print(my_Data_Frame.dataframe)
              the only column
            0            a; b
            1         c; d; e

            >>> # Tokenize strings in single-column dataframe
            >>> my_Data_Frame.tokenize_string_column(string_column_name='the only column',
            ...                                      delimiter_pattern_in_literal_cells='; ')\
            ...     .dataframe
              the only column
            0               a
            1               b
            2               c
            3               d
            4               e

            >>> #===== TOKENIZING A TWO-COLUMN DATAFRAME ===============================================================
            >>> # Create a simple dataframe
            >>> my_dataframe = pandas.DataFrame({
            ...      'literal_column':['literal one; literal two', 'literal three; literal four'],
            ...      'id_column': ['id 1', 'id 2']
            ... })

            >>> # Tokenize and view the dataframe
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
            ...                                                    id_column_name='id_column',
            ...                                                    delimiter_pattern_in_literal_cells='; ')\
            ...     .dataframe
              id_column literal_column
            0      id 1    literal one
            1      id 1    literal two
            2      id 2  literal three
            3      id 2   literal four


            >>> #===== TOKENIZING WITH REMOVAL OF SPACES BEFORE/AFTER TOKENS ===========================================
            >>> # Unwanted spaces can occur when a bare character (e.g., ';') is provided as the delimiter instead of '; '.

            >>> # Create a simple dataframe
            >>> my_dataframe = pandas.DataFrame({
            ...      'literal_column':['literal one ; literal two', 'literal three; literal four '],
            ...      'id_column': ['id 1', 'id 2']
            ... })

            >>> # Tokenize and view the dataframe
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
            ...                                                    id_column_name='id_column',
            ...                                                    delimiter_pattern_in_literal_cells=';')\
            ...     .dataframe
              id_column literal_column
            0      id 1    literal one
            1      id 1    literal two
            2      id 2  literal three
            3      id 2   literal four


            >>> #===== TOKENIZING IN CASES WHERE DELIMITERS ARE AT HEAD AND TAIL =======================================
            >>> # Create a simple dataframe
            >>> my_dataframe = pandas.DataFrame({
            ...      'literal_column':['tail issue a; tail issue b;', ';head issue a; head issue b', ';both issues a; both issues b;', 'no issues a; no issues b'],
            ...      'id_column': ['id 1', 'id 2', 'id 3', 'id 4']
            ... })

            >>> # Tokenize and view the dataframe
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
            ...                                                    id_column_name='id_column',
            ...                                                    delimiter_pattern_in_literal_cells='; ')\
            ...     .dataframe
              id_column literal_column
            0      id 1   tail issue a
            1      id 1   tail issue b
            2      id 2   head issue a
            3      id 2   head issue b
            4      id 3  both issues a
            5      id 3  both issues b
            6      id 4    no issues a
            7      id 4    no issues b

            >>> #===== TOKENIZING IN CASES WHERE DELIMITER(S) ARE THE ENTIRE STRING=====================================
            >>> # Create a simple dataframe
            >>> my_dataframe = pandas.DataFrame({
            ...      'literal_column':[';;;', ';;', ';', '; ;', 'non-problematic a; non-problematic b'],
            ...      'id_column': ['id 1', 'id 2', 'id 3', 'id 4', 'id 5']
            ... })

            >>> # Tokenize and view the dataframe
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
            ...                                                    id_column_name='id_column',
            ...                                                    delimiter_pattern_in_literal_cells='; ')\
            ...     .dataframe
              id_column     literal_column
            0      id 5  non-problematic a
            1      id 5  non-problematic b

            >>> #===== A REAL WORLD TWO-COLUMN EXAMPLE =================================================================
            >>> # Create a dataframe
            >>> my_dataframe = pandas.DataFrame({'wosKeywords': ['Clinical Neurology; Orthopedics', 'Biology; Mathematical & Computational Biology', 'Physics, Nuclear', 'Plant Sciences'],
            ...                                  'articleId': ['wosres:WOS_000071013000007', 'wosres:WOS_000071018600001', 'wosres:WOS_000071021600006', 'wosres:WOS_000071040300005']})
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
                                articleId                                    wosKeywords
            0  wosres:WOS_000071013000007                Clinical Neurology; Orthopedics
            1  wosres:WOS_000071018600001  Biology; Mathematical & Computational Biology
            2  wosres:WOS_000071021600006                               Physics, Nuclear
            3  wosres:WOS_000071040300005                                 Plant Sciences

            >>> # Tokenize the string column
            >>> my_Data_Frame.tokenize_string_column(string_column_name='wosKeywords',
            ...                                                           id_column_name='articleId',
            ...                                                           delimiter_pattern_in_literal_cells='; ')\
            ...     .dataframe
                                articleId                           wosKeywords
            0  wosres:WOS_000071013000007                    Clinical Neurology
            1  wosres:WOS_000071013000007                           Orthopedics
            2  wosres:WOS_000071018600001                               Biology
            3  wosres:WOS_000071018600001  Mathematical & Computational Biology
            4  wosres:WOS_000071021600006                      Physics, Nuclear
            5  wosres:WOS_000071040300005                        Plant Sciences

            >>> #===== ERROR: DATAFRAME HAS TOO MANY COLUMNS ===========================================================
            >>> # Create a simple dataframe
            >>> my_dataframe = pandas.DataFrame({
            ...      'literal_column':['literal one; literal two', 'literal three; literal four'],
            ...     'id_column': ['id 1', 'id 2'],
            ...     'third_column': ['abc', 'xyz']
            ... })
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
              id_column               literal_column third_column
            0      id 1     literal one; literal two          abc
            1      id 2  literal three; literal four          xyz

            >>> # Error: The input dataframe has too many columns:
            >>> try: my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
            ...                                           id_column_name='id_column',
            ...                                           delimiter_pattern_in_literal_cells='; ')
            ... except IndexError as exception:  # catch exception
            ...     print(exception)
            'tokenize_string_column' method can only take a Pandas.DataFrame with two columns. The current number of columns is 3.

        """
        import pandas
        from preprocessor.string_tools import String

        number_of_columns = self.dataframe.shape[1]
        if number_of_columns > 2:
            raise IndexError(
                "'tokenize_string_column' method can only take a Pandas.DataFrame with two columns. "
                "The current number of columns is %s." % number_of_columns)

        # get index positions of columns
        index_of_literal_column = self.dataframe.columns.get_loc(
            string_column_name)
        if id_column_name:
            index_of_id_column = self.dataframe.columns.get_loc(id_column_name)

        # tokenize literals at row level
        literal_column = self.dataframe[string_column_name]
        splitted_literal_column = literal_column.str.split(
            delimiter_pattern_in_literal_cells)
        # update the column
        self.dataframe[string_column_name] = splitted_literal_column

        # create blank dataframe for output
        original_column_names = list(self.dataframe.columns)
        output_dataframe = pandas.DataFrame(columns=original_column_names)

        # create a new row for each tokenized literal
        for each_row_number, each_row in self.dataframe.iterrows():

            column_names = each_row.index.values
            row_values = each_row.values

            for each_literal in row_values[index_of_literal_column]:

                # Clean from unwanted spaces at head and tail of tokens
                each_literal = String(each_literal)
                each_literal.clean_head_and_tail_iteratively_from_characters(
                    ' ')
                each_literal.clean_head_and_tail_iteratively_from_characters(
                    delimiter_pattern_in_literal_cells)
                each_literal = str(each_literal)

                if len(each_literal) > 0:  # do not allow empty rows to be part of the output dataframe

                    if id_column_name:
                        output_dataframe.loc[len(output_dataframe)] = (
                            row_values[index_of_id_column], each_literal)
                    else:
                        output_dataframe.loc[len(output_dataframe)] = (
                            each_literal)

        self.dataframe = output_dataframe
        return self
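
For comparison, the row-by-row append above can also be expressed with vectorized pandas operations. A minimal sketch, assuming pandas >= 0.25 (the frame, column names, and delimiter are illustrative):

import pandas

df = pandas.DataFrame({'id_column': ['id 1', 'id 2'],
                       'literal_column': ['a; b', ';c; d;']})

# split each cell into a list of tokens, then give each token its own row
tokens = (df.assign(literal_column=df['literal_column'].str.split('; '))
            .explode('literal_column'))

# strip stray spaces and delimiters, and drop empty tokens, mirroring the loop above
tokens['literal_column'] = tokens['literal_column'].str.strip(' ;')
tokens = tokens[tokens['literal_column'].str.len() > 0].reset_index(drop=True)
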
Example #2
    def purify_column(self, target_column_name):
        """
        Cleans the specified column from undesirable characters.

        Args:
            target_column_name(str): Column to be cleaned

        Returns:
            Data_Frame (updated self)

        Examples:
            >>> import pandas

            >>> # CLEAN A COLUMN =======================================================================================
            >>> # Create Data_Frame
            >>> my_dataframe = pandas.DataFrame({
            ...             'dirty_column':['{string} & one','String, "two"','[string] - three','(string) /\ four;'],
            ...             'id_column': ['id 1', 'id 2', 'id 3', 'id 4'],
            ...             'another_column': ['abc', 'mno', 'pqr', 'xyz']})
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
              another_column       dirty_column id_column
            0            abc     {string} & one      id 1
            1            mno      String, "two"      id 2
            2            pqr   [string] - three      id 3
            3            xyz  (string) /\ four;      id 4

            >>> # Clean the column
            >>> my_Data_Frame.purify_column('dirty_column')\
            ...              .dataframe
              another_column    dirty_column id_column
            0            abc  string and one      id 1
            1            mno     String, two      id 2
            2            pqr  string - three      id 3
            3            xyz     string four      id 4
            >>> #=======================================================================================================


            >>> # EXCEPTION: COLUMN MUST CONSIST OF STRINGS ============================================================
            >>> # Create a column that is made of integers
            >>> my_dataframe = pandas.DataFrame({
            ...      'integer_column':[1,
            ...                      2,
            ...                      3,
            ...                      4
            ...      ]
            ... })
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
               integer_column
            0               1
            1               2
            2               3
            3               4

            >>> # Fail to purify integer column
            >>> try:
            ...     my_Data_Frame.purify_column('integer_column')
            ... except Exception as exception:  # catch exception
            ...     print(exception)
            The target column "integer_column" must be of dtype "object". It is currently of dtype "int64".
            >>> #=======================================================================================================

        """
        from preprocessor.string_tools import String

        target_column = self.dataframe[target_column_name]

        # Target column must be made of strings
        # (pandas categorizes string columns as dtype 'object')
        self._force_column_type(target_column_name=target_column_name,
                                dtype='object')

        conversion_dictionary = {
            '/': '',
            ';': '',       # a semicolon sometimes trails keyword lists (e.g., "kw1; kw2; kw3;" instead of "kw1; kw2; kw3")
            '&': 'and',
            r'\(|\)': '',  # parentheses
            r'\[|\]': '',  # square brackets
            r'\{|\}': '',  # curly braces
            '  ': ' '      # double spaces (may occur after cleaning other characters)
        }

        # Purify each string in the column
        for i, each_item in enumerate(target_column):
            each_String = String(each_item)
            each_String.purify(clean_from_non_ascii_characters=True,
                               remove_problematic_patterns=True,
                               clean_newline_characters=True)
            each_String.replace_patterns(conversion_dictionary)
            target_column.loc[i] = each_String.content

        return self
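
The dictionary-based substitutions above can also be applied column-wide with pandas.Series.replace in regex mode. A sketch under the assumption that only those substitutions are needed (the non-ASCII and problematic-pattern cleaning done by String.purify is omitted; the frame is illustrative):

import pandas

df = pandas.DataFrame({'dirty_column': ['{string} & one', '(string) / two;']})
conversion_dictionary = {'/': '', ';': '', '&': 'and',
                         r'\(|\)': '', r'\[|\]': '', r'\{|\}': '', '  ': ' '}

# dictionary keys are interpreted as regular expressions when regex=True
df['dirty_column'] = df['dirty_column'].replace(conversion_dictionary, regex=True)
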
Example #3
    def clean_heads_and_tails_of_cells_in_column_from_patterns(
            self, target_column_name, patterns_to_remove, location):
        """
        Cleans the strings in the specified column from the specified patterns at the head, the tail, or both ends.

        Args:
            target_column_name(str): Column to be cleaned
            patterns_to_remove(list): A list of strings containing the patterns to remove.
            location(str): Where to remove the patterns: 'head', 'tail', or 'both'.

        Keyword Args:
            head (location): Cleans the beginning of each string from the specified patterns
            tail (location): Cleans the end of each string from the specified patterns
            both (location): Cleans a string only if a pattern occurs at both its beginning and its end

        Returns:
            Data_Frame (updated self)

        Examples:
            >>> # INIT =================================================================================================
            >>> # Create Data_Frame
            >>> import pandas as pd
            >>> my_dataframe = pd.DataFrame({
            ...             'dirty_column':[';head issue','tail issue;',';both issues;',';complex situation; head',
            ...             'complex situation; tail;', ';complex situation; both;'],
            ...             'id_column': ['id 1', 'id 2', 'id 3', 'id 4', 'id5', 'id6'],
            ...             'another_column': ['abc', 'def', 'mno', 'pqr', 'stu', 'xyz']})
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
              another_column               dirty_column id_column
            0            abc                ;head issue      id 1
            1            def                tail issue;      id 2
            2            mno              ;both issues;      id 3
            3            pqr   ;complex situation; head      id 4
            4            stu   complex situation; tail;       id5
            5            xyz  ;complex situation; both;       id6

            >>> # CLEAN HEAD ===========================================================================================
            >>> # Clean the heads of strings (without touching the same pattern elsewhere)
            >>> my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('dirty_column', [';'], 'head')\
            ...              .dataframe
              another_column              dirty_column id_column
            0            abc                head issue      id 1
            1            def               tail issue;      id 2
            2            mno              both issues;      id 3
            3            pqr   complex situation; head      id 4
            4            stu  complex situation; tail;       id5
            5            xyz  complex situation; both;       id6

            >>> # CLEAN TAIL ===========================================================================================
            >>> # Clean the tails of strings (without touching the same pattern elsewhere)
            >>> my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('dirty_column', [';'], 'tail')\
            ...              .dataframe
              another_column             dirty_column id_column
            0            abc               head issue      id 1
            1            def               tail issue      id 2
            2            mno              both issues      id 3
            3            pqr  complex situation; head      id 4
            4            stu  complex situation; tail       id5
            5            xyz  complex situation; both       id6

            >>> # CLEAN BOTH ===========================================================================================
            >>> # Recreate Data_Frame
            >>> import pandas as pd
            >>> my_dataframe = pd.DataFrame({
            ...             'dirty_column':[';head issue','tail issue;',';both issues;',';complex situation; head',
            ...             'complex situation; tail;', ';complex situation; both;'],
            ...             'id_column': ['id 1', 'id 2', 'id 3', 'id 4', 'id5', 'id6'],
            ...             'another_column': ['abc', 'def', 'mno', 'pqr', 'stu', 'xyz']})
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
              another_column               dirty_column id_column
            0            abc                ;head issue      id 1
            1            def                tail issue;      id 2
            2            mno              ;both issues;      id 3
            3            pqr   ;complex situation; head      id 4
            4            stu   complex situation; tail;       id5
            5            xyz  ;complex situation; both;       id6

            >>> # Clean both the heads and tails of strings (without touching the same pattern elsewhere)
            >>> # Note that when the location is 'both', removal proceeds ONLY if the pattern exists at both head and tail
            >>> my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('dirty_column',
            ...                                                                      patterns_to_remove=[';'],
            ...                                                                      location='both')\
            ...     .dataframe
              another_column              dirty_column id_column
            0            abc               ;head issue      id 1
            1            def               tail issue;      id 2
            2            mno               both issues      id 3
            3            pqr  ;complex situation; head      id 4
            4            stu  complex situation; tail;       id5
            5            xyz   complex situation; both       id6

            >>> # EXCEPTION: COLUMN MUST CONSIST OF STRINGS ============================================================
            >>> # Create a column that is made of integers
            >>> my_dataframe = pd.DataFrame({
            ...      'integer_column':[1,
            ...                      2,
            ...                      3,
            ...                      4
            ...      ]
            ... })
            >>> my_Data_Frame = Data_Frame(my_dataframe)
            >>> my_Data_Frame.dataframe
               integer_column
            0               1
            1               2
            2               3
            3               4

            >>> # Fail to clean integer column from characters
            >>> try:
            ...     my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('integer_column', [';'], 'head')
            ... except Exception as exception:  # catch exception
            ...     print(exception)
            The target column "integer_column" must be of dtype "object". It is currently of dtype "int64".
            >>> #=======================================================================================================
    """
        from preprocessor.string_tools import String

        target_column = self.dataframe[target_column_name]

        # Target column must be made of strings
        # (pandas categorizes string columns as dtype 'object')
        self._force_column_type(target_column_name=target_column_name,
                                dtype='object')

        # Clean each string in the column from the specified characters
        for i, each_item in enumerate(target_column):
            each_String = String(each_item)

            each_String.clean_head_and_tail_from_patterns(
                patterns_to_remove=patterns_to_remove, location=location)
            target_column.loc[i] = each_String.content

        return self
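
String.clean_head_and_tail_from_patterns is defined elsewhere in preprocessor.string_tools and is not shown here; a minimal standalone sketch of the documented semantics (note that 'both' removes a pattern only when it occurs at BOTH ends):

def clean_head_and_tail_from_patterns(text, patterns_to_remove, location):
    for pattern in patterns_to_remove:
        if location == 'head' and text.startswith(pattern):
            text = text[len(pattern):]
        elif location == 'tail' and text.endswith(pattern):
            text = text[:-len(pattern)]
        elif location == 'both' and text.startswith(pattern) and text.endswith(pattern):
            text = text[len(pattern):-len(pattern)]
    return text

clean_head_and_tail_from_patterns(';both issues;', [';'], 'both')  # 'both issues'
clean_head_and_tail_from_patterns(';head issue', [';'], 'both')    # unchanged: pattern not at both ends
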
Example #4
###############################################################

my_row = CSV_Line(' "a" , "b" , "c" ,').clean_from_newline_characters().\
    clean_head_and_tail_from_patterns(' ', 'head').\
    clean_head_and_tail_from_patterns(' ,', 'tail').\
    parse_line_and_CONVERT_to_CSV_Row(' , ').\
    clean_cell_heads_and_tails_from_characters('"')  # chaining methods (fluent interface) is optional

print(my_row)
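# likely prints something like "['a', 'b', 'c']" here
# (the exact representation depends on CSV_Row's __str__)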

my_row.format_for_print_and_CONVERT_to_CSV_Line(column_separator=' , ',
                                                line_head=' ',
                                                line_tail=' ,',
                                                cell_wrapper='"')

String('-----STRING=======').clean_head_and_tail_iteratively_from_characters(
    '-=')
String('ABC123').clip_at_index(4, remove='tail')

##########################################################
####################### CSV EXPORT #######################
##########################################################

demo_bibliography = Bibliography()
demo_bibliography.importBibtex('examples/example_data/IDR_Literature_WOS.bib')
demo_bibliography.exportToCsv(
    output_file_path='examples//example_data//demo_output.csv',
    columns_to_ignore=[
        'b_document', 'b_authors', 'b_topics', 'b_journal',
        'b_publication_month', 'b_issue_number', 'b_volume', 'b_pages',
        'b_pure_bibliography_id'
    ])

Example #5

"""
Retrieves all articles from OpenCitations that have the same DOI as the records in the VU and UvA bibliographies.
"""

# parse list from file (probably exists in ListData)
from retriever.sparql_tools import Open_Citations_Query
from meta.consoleOutput import ConsoleOutput
from preprocessor.string_tools import String

console = ConsoleOutput('log.txt')

doi_list = []
with open('Input//all_dois_in_uva_and_vu_bibliographies.csv',
          encoding='utf8') as doi_file:
    for each_line in doi_file:
        each_line = String(each_line)
        each_line.clean_from_newline_characters()
        doi_list.append(str(each_line))

oc_query = Open_Citations_Query()
oc_query.retrieve_articles_by_dois(doi_list, show_progress_bar=True)
oc_query.write_results_to_csv('Input//oc_articles_with_matching_dois_v1.3.csv')
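
# Some entries in the demo DOI list below are resolver URLs or labeled strings
# (e.g., 'https://doi.org/10.1101/167171', '(DOI) - 10.1111/cch.12521'). A hedged
# normalization sketch that could run before retrieve_articles_by_dois; the regex
# is an assumption about what counts as a bare DOI:
import re

def normalize_doi(raw_doi):
    # keep only the bare '10.xxxx/...' part, dropping resolver prefixes and labels
    match = re.search(r'10\.\d{4,9}/\S+', raw_doi)
    return match.group(0) if match else None

# doi_list = [doi for doi in (normalize_doi(d) for d in doi_list) if doi]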

# A demo list with 100 DOIs
# doi_list = ['10.1163/187607508X384689', '10.1017/S0954579416000572', '10.1007/s11562-016-0353-7', '10.1016/j.adolescence.2016.09.008', '10.1186/s13561-016-0122-6', '10.1007/s00799-016-0182-6', '10.5194/gmd-2016-266', '10.1007/s00737-015-0531-2', '10.1103/RevModPhys.88.021003', 'https://doi.org/10.1101/167171', 'https://doi.org/10.1016/j.chb.2017.04.047', '10.1016/j.trb.2016.09.005', '10.1016/j.ancene.2016.01.001', '10.1111/adb.12322', '10.1017/njg.2016.45', '10.1080/1359432X.2016.1209489', '10.1117/1.JBO.21.6.066008', '10.5194/gmd-10-3329-2017', '10.1016/j.rser.2017.01.103', '10.1177/2050157916664559', '10.1007/978-3-319-45931-8_17', '10.1007/s11136-015-1171-8', '10.1145/2991079.2991121', '10.1093/cz/zow089', '10.1126/science.aac8167', '10.1007/s00586-016-4606-1', '10.1186/s12937-017-0229-6', '10.1007/s11357-016-9894-1', '10.1080/00130095.2015.1094371', '10.1016/j.epsl.2016.02.028', '10.1371/journal.pone.0168636', '10.1016/j.atmosres.2016.03.016', '10.1111/deci.12206', '10.1126/science.aad9634', '10.1103/PhysRevA.94.012506', '10.4103/0019-5545.196846', '10.1016/j.cedpsych.2017.01.006', '10.3324/haematol.2015.133470', '10.1057/978-1-137-50956-7', '10.1016/j.scico.2016.04.001', 'https://doi.org/10.1016/j.scico.2016.04.001', '10.1080/03081087.2015.1053425', '10.3758/s13423-017-1270-3', '10.1681/ASN.2015030287', '10.1016/j.avb.2016.05.006', '10.1177/0971333616689191', '10.1002/sej.1243', '10.1016/j.foreco.2017.06.023', '10.1103/PhysRevLett.118.071801', 'https://doi.org/10.1093/geront/gnv127', '10.1007/978-3-319-42324-1_16', '10.1109/JBHI.2015.2412656', '10.1016/j.jeem.2016.04.002', '10.1080/00207543.2015.1058982', '10.1038/mp.2016.100', '10.1080/03003930.2016.1194267', '10.1016/j.envint.2017.01.018', '10.1038/pr.2015.179', '10.1177/1753193416669263', '10.1016/j.tre.2016.11.003', '10.1021/acs.jpcc.5b12016', '10.1002/anie.201603510', '10.1073/pnas.1607005113', '(DOI) - 10.1111/cch.12521', '10.1017/S0016756815000886', '10.1080/1350293X.2015.1073507', '10.1152/jn.00701.2015', '10.1371/journal.pone.0170791', '10.1016/j.seares.2016.07.005', '10.1016/j.reseneeco.2016.03.003', '10.1007/s00531-017-1499-0', '10.1007/s41669-017-0014-7', '10.1093/acrefore/9780190228613.013.439', '10.14814/phy2.13201', '10.1016/j.jtrangeo.2016.10.013', '10.1523/JNEUROSCI.3658-16.2017', '10.1192/bjpo.bp.115.000166', '10.1136/bmjgh-2016-000109', '10.7554/eLife.20320.001', '10.1037/pas0000332', '10.1177/1474704916673841', '10.1057/978-1-137-58179-2', '10.1002/ejp.963', '10.1017/thg.2016.78', '10.1038/tpj.2016.32', '10.1016/j.jesp.2017.03.008', '10.1287/trsc.2015.0647', '10.1186/s13015-016-0087-3', '10.1016/j.neuroimage.2016.10.030', '10.1371/journal.pone.0169109', '10.1007/s11367-017-1358-z', '10.1080/1369183X.2015.1061425', '10.2196/mental.4614', '10.1002/arp.1564', '10.1021/acs.orglett.6b01023', '10.3847/1538-4357/aa6c47', 'http://www.socialevraagstukken.nl/veiligheid-creeer-je-met-geborgenheid/', '10.1186/s12888-016-0790-0', '10.1371/journal.pone.0155755', '10.1103/PhysRevLett.116.241801']
Example #6
    def get_line_at_position_from_file(self, line_number):
        """
        Returns a specified line from the TextFile without reading the whole file into memory.

        Args:
            line_number(int): Line number to return. Must be a positive integer (line numbering starts at 1).

        Returns:
            String class object (created from string at line in file).

        See Also:
            CSV_File.get_line_at_position_from_file

        Examples:
            >>> # return first line of file
            >>> my_file = Text_File('test_data//example_merged_yasgui_1000.csv')
            >>> my_file.get_line_at_position_from_file(1)
            '"publication_type" , "journal_article" , "title" , "publication_year" , "author_name" , "journal_name" , "journal_issue_number" , "journal_volume_number" , "startEndPages" , "publisher_name" , "doi" , "cited_by_article" ,'

            >>> # return another line
            >>> my_file.get_line_at_position_from_file(122)
            '"Journal Article" , "https://w3id.org/oc/corpus/br/3448" , "Perioperative Myocardial Infarction" , "2009" , "Beattie - W. S. | Mosseri - M. | Jaffe - A. S. | Alpert - J. S." , "Circulation" , "22" , "119" , "2936--2944" , "Ovid Technologies (Wolters Kluwer Health)" , "10.1161/circulationaha.108.828228" , "https://w3id.org/oc/corpus/br/3426" ,'

            >>> # return last line
            >>> my_file.get_line_at_position_from_file(267)
            '"Journal Article" , "https://w3id.org/oc/corpus/br/3437" , "Myocardial Injury after Noncardiac Surgery" , "2014" , "Niebrzegowska - Edyta | Benton - Sally | Wragg - Andrew | Archbold - Andrew | Smith - Amanda | McAlees - Eleanor | Ramballi - Cheryl | MacDonald - Neil | Januszewska - Marta | Shariffuddin - Ina I. | Vasanthan - V. | Hashim - N. H. M. | Undok - A. Wahab | Ki - Ushananthini | Lai - Hou Yee | Ahmad - Wan Azman | Ackland - Gareth | Khan - Ahsun | Almeida - Smitha | Cherian - Joseph | Furruqh - Sultana | Abraham - Valsa | Paniagua - Pilar | Urrutia - Gerard | Maestre - Mari Luz | Santaló - Miquel | Gonzalez - Raúl | Font - Adrià | Martínez - Cecilia" , "Anesthesiology" , "3" , "120" , "564--578" , "Ovid Technologies (Wolters Kluwer Health)" , "10.1097/aln.0000000000000113" , "https://w3id.org/oc/corpus/br/3522 | https://w3id.org/oc/corpus/br/300243 | https://w3id.org/oc/corpus/br/3062326 | https://w3id.org/oc/corpus/br/3271454 | https://w3id.org/oc/corpus/br/3879533 | https://w3id.org/oc/corpus/br/4205354 | https://w3id.org/oc/corpus/br/5253819 | https://w3id.org/oc/corpus/br/6332120 | https://w3id.org/oc/corpus/br/7799424 | https://w3id.org/oc/corpus/br/8003885 | https://w3id.org/oc/corpus/br/8185544" ,'

            >>> # erroneous line number entered (0)
            >>> my_file = Text_File('test_data//example_merged_yasgui_1000.csv')
            >>> try:
            ...     my_file.get_line_at_position_from_file(0) #  line_number cannot be 0
            ... except Exception as error_message:
            ...     print('Exception: ' + str(error_message))
            Exception: Parameter value must be a positive integer but is "0" of <class 'int'>.


            >>> # erroneous line number entered (too high)
            >>> try:
            ...     my_file.get_line_at_position_from_file(300) # there is no 300th line in the file
            ... except IndexError as error_message:
            ...     print('Exception: ' + str(error_message))
            Exception: Requested line number '300' does not exist in file.
        """
        from preprocessor.string_tools import String, Parameter_Value
        Parameter_Value(line_number).force_positive_integer()

        with open(self.input_file_path, encoding='utf8') as input_file:
            line = None

            for i, each_line in enumerate(input_file):
                current_iteration_step = i + 1  # to align index numbers (starting from 0) and line numbers (start from 1)
                if current_iteration_step == line_number:
                    line = String(each_line)
                elif current_iteration_step > line_number:
                    break

            if line is None:
                raise IndexError(
                    "Requested line number '%s' does not exist in file." %
                    line_number)

            # if not cleaned from '\n', comparisons and operations tend to be problematic
            # write to file with base print() function to get back the new line in the end
            line.clean_from_newline_characters()

            return line
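
An equivalent single-line fetch can be written with itertools.islice. A sketch, assuming the same 1-based numbering and utf-8 encoding as the method above:

from itertools import islice

def get_line_at_position(input_file_path, line_number):
    with open(input_file_path, encoding='utf8') as input_file:
        # skip line_number - 1 lines, then take at most one
        line = next(islice(input_file, line_number - 1, line_number), None)
    if line is None:
        raise IndexError("Requested line number '%s' does not exist in file." % line_number)
    return line.rstrip('\n')
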
Example #7
    def is_each_row_balanced(self, exclude_special_rows_of_syntax=None):
        """
        Checks whether each row in the buffer is balanced (i.e., does not have unmatched parentheses, brackets, etc.).
        Can exclude special row types (e.g., comments) from evaluation.

        Args:
            exclude_special_rows_of_syntax(str): specifies what type of rows to exclude from evaluation
                (e.g., comment rows). Uses predefined syntax settings per specified syntax (e.g., 'bibtex').

        Keyword Args:
            - bibtex (exclude_special_rows_of_syntax): sets evaluation exclusion criteria for bibtex syntax

        Returns:
            boolean

        Examples:
            >>> # an unbalanced row is present
            >>> my_buffer = ListBuffer()
            >>> my_buffer.append_row(['a', 'b', 'c']).append_row(['d', 'e', 'f']).dataset
            [['a', 'b', 'c'], ['d', 'e', 'f']]
            >>> my_buffer.append_row(['g', 'h' , '>'])\
            ...     .is_each_row_balanced()
            False

            >>> # single row from a bib file
            >>> my_buffer = ListBuffer()
            >>> my_buffer.append_row('            year      = "2017",')\
            ...     .is_each_row_balanced()
            True

            >>> # bibtex entry start (no exception vs. exception)
            >>> my_buffer.append_row('@article{96d9add3e2f44e8abbf030170689bc30,')\
            ...     .is_each_row_balanced()
            False
            >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')
            True

            >>> # bibtex comment (no exception vs. exception)
            >>> my_buffer = ListBuffer()
            >>> my_buffer.append_row('% This is a comment with an unbalanced characters }]>')\
            ...     .is_each_row_balanced()
            False
            >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')
            True

            >>> # a full bibtex entry with an unbalanced curly bracket at title field
            >>> my_buffer = ListBuffer()
            >>> my_buffer.dataset = ['@book{a82caf00e1a143759c7f5543b6c84ea5,', 'title     = "{Knowledge Representation for Health Care (AIME 2015 International Joint Workshop, KR4HC/ProHealth 2015)",', 'author    = "D Riano and R. Lenz and S Miksch and M Peleg and M. Reichert and {ten Teije}, A.C.M.",', 'year      = "2015",', 'doi       = "10.1007/978-3-319-26585-8",', 'isbn      = "9783319265841",', 'series    = "LNAI",', 'publisher = "Springer",', 'number    = "9485",', '}', '']
            >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')  # False: unbalanced '{' in title
            False
            >>> # the same entry with unbalanced curly bracket removed
            >>> my_buffer.dataset = ['@book{a82caf00e1a143759c7f5543b6c84ea5,', 'title     = "Knowledge Representation for Health Care (AIME 2015 International Joint Workshop, KR4HC/ProHealth 2015)",', 'author    = "D Riano and R. Lenz and S Miksch and M Peleg and M. Reichert and {ten Teije}, A.C.M.",', 'year      = "2015",', 'doi       = "10.1007/978-3-319-26585-8",', 'isbn      = "9783319265841",', 'series    = "LNAI",', 'publisher = "Springer",', 'number    = "9485",', '}', '']
            >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')
            True

        """

        from preprocessor.string_tools import String

        buffer = self.dataset

        is_balanced_log = []

        for each_row in buffer:
            each_row = String(str(each_row))

            if not each_row.is_balanced():
                ### EXCLUSIONS FOR BIBTEX ###########################################
                if exclude_special_rows_of_syntax == 'bibtex':
                    # forgive these row types, which are unbalanced by design
                    if each_row.is_line_type('bibtex', 'start of entry') \
                            or each_row.is_line_type('bibtex', 'end of entry') \
                            or each_row.is_line_type('bibtex', 'comment'):
                        is_balanced_log.append(True)
                    else:
                        is_balanced_log.append(False)
                ######################################################################
                else:
                    is_balanced_log.append(False)
            else:
                is_balanced_log.append(True)

        return all(is_balanced_log)
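
String.is_balanced is defined elsewhere in preprocessor.string_tools; a minimal stack-based sketch of the underlying idea (the delimiter set is an assumption):

def is_balanced(text):
    pairs = {'(': ')', '[': ']', '{': '}', '<': '>'}
    stack = []
    for character in text:
        if character in pairs:              # opening delimiter: remember its closer
            stack.append(pairs[character])
        elif character in pairs.values():   # closing delimiter: must match the latest opener
            if not stack or stack.pop() != character:
                return False
    return not stack                        # balanced only if every opener was closed

is_balanced("['g', 'h', '>']")  # False: '>' has no matching '<'
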
Example #8
    def cleanAndTokenizeCsv(instance):
        """
        Imports the .csv file as raw text, cleans it (if a cleaning algorithm is specified), and then tokenizes it.

        Returns:
            List containing the parsed data from the .csv file. For each row in the .csv file (including the header
                row), a sub-list is created in the main list.

        Examples:
            >>> from preprocessor.Text_File import Text_File
            >>> my_file = Text_File('example_data//problematic_yasgui_csv_file.csv')
            >>> my_file.print_lines(2)
            "27624462" , "2016" , "Journal Article" , "Duku - Stephen Kwasi☆☆ Opokué | Asenso-Boadi - Francis" , "[]{}\ '<Utilization, of, healthcare services and renewal of health insurance membership: evidence of adverse selection in Ghana" , "Springer Science + Business Media" , "1" , "Health Econ Rev - Health Economics Review" , "http://dx.doi.org/10.1186/s13561-016-0122-6" , "6" , "10.1186/s13561-016-0122-6" , "" , "https://w3id.org/oc/corpus/br/3555801" , "https://w3id.org/oc/corpus/br/18754 | https://w3id.org/oc/corpus/br/18792" ,


            >>> my_csv_bibliography = CSV_Bibliography(
            ...                           csv_file_path='example_data//problematic_yasgui_csv_file.csv',
            ...                           id_column_header='journal_article',
            ...                           field_value_list_separator=' | ',
            ...                           csv_delimiter_character=',',
            ...                           cleaning_algorithm='default'
            ... )
            Conversion from ListData to Bibliography object started
            Conversion completed. 2 out of 2 ListData rows converted to Bibliography object entries
            >>> my_csv_bibliography.preview(1) # notice the character conversions in the 'authors' and 'title' fields
            <BLANKLINE>
            ----------------------------------ENTRY 1----------------------------------
            ('https://w3id.org/oc/corpus/br/3555801',
             {'': '',
              'authors': ['Duku - Stephen Kwasiaa OpokuAS', 'Asenso-Boadi - Francis'],
              'cited_by_the_articles': '',
              'cited_the_articles': ['https://w3id.org/oc/corpus/br/18754',
                                     'https://w3id.org/oc/corpus/br/18792'],
              'doi': '10.1186/s13561-016-0122-6',
              'journal_article': 'https://w3id.org/oc/corpus/br/3555801',
              'journal_issue_number': '1',
              'journal_name': 'Health Econ Rev - Health Economics Review',
              'journal_volume_number': '6',
              'pmid': '27624462',
              'publication_type': 'Journal Article',
              'publication_year': '2016',
              'publisher_name': 'Springer Science + Business Media',
              'title': ' Utilization-of-healthcare services and renewal of health '
                       'insurance membership: evidence of adverse selection in Ghana',
              'url': 'http://dx.doi.org/10.1186/s13561-016-0122-6'})
            <BLANKLINE>
        """
        import re
        import csv
        from os import remove as os_remove
        from preprocessor.string_tools import String

        # open the csv file and read it into a variable
        with open(instance.csv_file_path, mode="r",
                  encoding="utf8") as imported_file_raw:
            imported_string_raw = imported_file_raw.read()

        # if no cleaning algorithm is specified, skip cleaning and just tokenize
        if instance.cleaning_algorithm == 'parse only':
            imported_string_cleaned = imported_string_raw

        # otherwise, run cleaning algorithm
        elif instance.cleaning_algorithm == 'default':
            # TODO: The current way to remove in-string commas is tuned for OpenCitations data with yasgui style CSV. Make a generic version by using a while loop (see commented out draft below).
            # clean commas that occur in entry field values (i.e., within strings)
            imported_string_cleaned = re.sub(' ,', '_-_-_',
                                             imported_string_raw)
            imported_string_cleaned = re.sub(', ', '-',
                                             imported_string_cleaned)
            imported_string_cleaned = re.sub('_-_-_', ' ,',
                                             imported_string_cleaned)

            # clean CSV file from double quotes
            imported_string_cleaned = re.sub(' "|" ', '',
                                             imported_string_cleaned)

            # clean from characters and patterns that are generally problematic for parsing operations
            imported_string_cleaned = String(imported_string_cleaned)
            imported_string_cleaned.purify(
                clean_from_non_ascii_characters=True,
                remove_problematic_patterns=True,
                clean_newline_characters=False)
            imported_string_cleaned = str(imported_string_cleaned)

            # # Draft while loop for a more generic future algorithm to replace in-string commas:
            #
            # between_quotes = False
            # for i, each_character in enumerate(imported_string_cleaned):
            #
            #    if between_quotes:
            #        if each_character == ",":
            #            imported_string_cleaned[i] = "-"
            #            print(imported_string_cleaned)
            #
            #    # first occurrence
            #    if each_character == '\"' and not between_quotes:
            #        between_quotes = True
            #    elif each_character == '\"' and between_quotes:
            #        between_quotes = False
        # if the cleaning_algorithm parameter is not recognized, return error
        else:
            raise ValueError('Unknown algorithm type: ' +
                             instance.cleaning_algorithm +
                             '. Please enter a valid algorithm string.')

        # write the cleaned text to a temporary file (csv.reader needs a file-like source)
        cleaned_file_path = "temp_cleaned.csv"
        with open(cleaned_file_path, mode="w",
                  encoding="utf8") as cleaned_csv_file:
            cleaned_csv_file.write(imported_string_cleaned)

        # read the temporary file back and tokenize it
        with open(cleaned_file_path, mode="r",
                  encoding="utf8") as cleaned_csv_file:
            cleaned_csv_file_content = list(
                csv.reader(cleaned_csv_file,
                           delimiter=instance.csv_delimiter_character))

        # remove the temporary file
        os_remove(cleaned_file_path)

        return cleaned_csv_file_content
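
The TODO above asks for a more generic replacement for the in-string comma workaround. csv.reader already honors quoted fields, so a sketch that skips both the placeholder substitution and the temporary file, assuming the input is well-formed quoted CSV:

import csv
import io

def tokenize_csv_text(raw_text, delimiter=','):
    # commas inside double-quoted cells survive without any manual substitution
    return list(csv.reader(io.StringIO(raw_text),
                           delimiter=delimiter,
                           skipinitialspace=True))

tokenize_csv_text('a,"b, still b",c\nd,e,f')
# [['a', 'b, still b', 'c'], ['d', 'e', 'f']]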