def tokenize_string_column(self, delimiter_pattern_in_literal_cells, string_column_name, id_column_name=None):
    """
    Tokenizes string literals by assigning one split part (e.g., keyword) per row. The input
    dataframe must have at most two columns: one column containing the strings to be tokenized,
    and one column containing IDs (optional).

    Args:
        delimiter_pattern_in_literal_cells(str)
        string_column_name(str)
        id_column_name(str)

    Returns:
        Data_Frame (updated self)

    Examples:
        >>> import pandas
        >>> #===== TOKENIZING A SINGLE-COLUMN DATAFRAME ============================================================
        >>> # Make a single-column dataframe:
        >>> df = pandas.DataFrame({'the only column': ('a; b', 'c; d; e')})
        >>> my_Data_Frame = Data_Frame(df)
        >>> print(my_Data_Frame.dataframe)
          the only column
        0            a; b
        1         c; d; e
        >>> # Tokenize strings in the single-column dataframe
        >>> my_Data_Frame.tokenize_string_column(string_column_name='the only column',
        ...                                      delimiter_pattern_in_literal_cells='; ')\
            .dataframe
          the only column
        0               a
        1               b
        2               c
        3               d
        4               e

        >>> #===== TOKENIZING A TWO-COLUMN DATAFRAME ===============================================================
        >>> # Create a simple dataframe
        >>> my_dataframe = pandas.DataFrame({
        ...     'literal_column': ['literal one; literal two', 'literal three; literal four'],
        ...     'id_column': ['id 1', 'id 2']
        ... })
        >>> # Tokenize and view the dataframe
        >>> my_Data_Frame = Data_Frame(my_dataframe)
        >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
        ...                                      id_column_name='id_column',
        ...                                      delimiter_pattern_in_literal_cells='; ')\
            .dataframe
          id_column literal_column
        0      id 1    literal one
        1      id 1    literal two
        2      id 2  literal three
        3      id 2   literal four

        >>> #===== TOKENIZING WITH REMOVAL OF SPACES BEFORE/AFTER TOKENS ===========================================
        >>> # Unwanted spaces occur when a single character (e.g., ';') is provided as delimiter instead of '; '.
        >>> # Create a simple dataframe
        >>> my_dataframe = pandas.DataFrame({
        ...     'literal_column': ['literal one ; literal two', 'literal three; literal four '],
        ...     'id_column': ['id 1', 'id 2']
        ... })
        >>> # Tokenize and view the dataframe
        >>> my_Data_Frame = Data_Frame(my_dataframe)
        >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
        ...                                      id_column_name='id_column',
        ...                                      delimiter_pattern_in_literal_cells=';')\
            .dataframe
          id_column literal_column
        0      id 1    literal one
        1      id 1    literal two
        2      id 2  literal three
        3      id 2   literal four

        >>> #===== TOKENIZING IN CASES WHERE DELIMITERS ARE AT HEAD AND TAIL =======================================
        >>> # Create a simple dataframe
        >>> my_dataframe = pandas.DataFrame({
        ...     'literal_column': ['tail issue a; tail issue b;', ';head issue a; head issue b',
        ...                        ';both issues a; both issues b;', 'no issues a; no issues b'],
        ...     'id_column': ['id 1', 'id 2', 'id 3', 'id 4']
        ... })
        >>> # Tokenize and view the dataframe
        >>> my_Data_Frame = Data_Frame(my_dataframe)
        >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
        ...                                      id_column_name='id_column',
        ...                                      delimiter_pattern_in_literal_cells='; ')\
            .dataframe
          id_column literal_column
        0      id 1   tail issue a
        1      id 1   tail issue b
        2      id 2   head issue a
        3      id 2   head issue b
        4      id 3  both issues a
        5      id 3  both issues b
        6      id 4    no issues a
        7      id 4    no issues b

        >>> #===== TOKENIZING IN CASES WHERE DELIMITER(S) ARE THE ENTIRE STRING ====================================
        >>> # Create a simple dataframe
        >>> my_dataframe = pandas.DataFrame({
        ...     'literal_column': [';;;', ';;', ';', '; ;', 'non-problematic a; non-problematic b'],
        ...     'id_column': ['id 1', 'id 2', 'id 3', 'id 4', 'id 5']
        ... })
        >>> # Tokenize and view the dataframe
        >>> my_Data_Frame = Data_Frame(my_dataframe)
        >>> my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
        ...                                      id_column_name='id_column',
        ...                                      delimiter_pattern_in_literal_cells='; ')\
            .dataframe
          id_column     literal_column
        0      id 5  non-problematic a
        1      id 5  non-problematic b

        >>> #===== A REAL WORLD TWO-COLUMN EXAMPLE =================================================================
        >>> # Create a dataframe
        >>> my_dataframe = pandas.DataFrame({
        ...     'wosKeywords': ['Clinical Neurology; Orthopedics',
        ...                     'Biology; Mathematical & Computational Biology',
        ...                     'Physics, Nuclear',
        ...                     'Plant Sciences'],
        ...     'articleId': ['wosres:WOS_000071013000007', 'wosres:WOS_000071018600001',
        ...                   'wosres:WOS_000071021600006', 'wosres:WOS_000071040300005']
        ... })
        >>> my_Data_Frame = Data_Frame(my_dataframe)
        >>> my_Data_Frame.dataframe
                            articleId                                    wosKeywords
        0  wosres:WOS_000071013000007                Clinical Neurology; Orthopedics
        1  wosres:WOS_000071018600001  Biology; Mathematical & Computational Biology
        2  wosres:WOS_000071021600006                               Physics, Nuclear
        3  wosres:WOS_000071040300005                                 Plant Sciences
        >>> # Tokenize the string column
        >>> my_Data_Frame.tokenize_string_column(string_column_name='wosKeywords',
        ...                                      id_column_name='articleId',
        ...                                      delimiter_pattern_in_literal_cells='; ')\
            .dataframe
                            articleId                           wosKeywords
        0  wosres:WOS_000071013000007                    Clinical Neurology
        1  wosres:WOS_000071013000007                           Orthopedics
        2  wosres:WOS_000071018600001                               Biology
        3  wosres:WOS_000071018600001  Mathematical & Computational Biology
        4  wosres:WOS_000071021600006                      Physics, Nuclear
        5  wosres:WOS_000071040300005                        Plant Sciences

        >>> #===== ERROR: DATAFRAME HAS TOO MANY COLUMNS ===========================================================
        >>> # Create a simple dataframe
        >>> my_dataframe = pandas.DataFrame({
        ...     'literal_column': ['literal one; literal two', 'literal three; literal four'],
        ...     'id_column': ['id 1', 'id 2'],
        ...     'third_column': ['abc', 'xyz']
        ... })
        >>> my_Data_Frame = Data_Frame(my_dataframe)
        >>> my_Data_Frame.dataframe
          id_column               literal_column third_column
        0      id 1     literal one; literal two          abc
        1      id 2  literal three; literal four          xyz
        >>> # Error: the input dataframe has too many columns:
        >>> try:
        ...     my_Data_Frame.tokenize_string_column(string_column_name='literal_column',
        ...                                          id_column_name='id_column',
        ...                                          delimiter_pattern_in_literal_cells='; ')
        ... except IndexError as exception:  # catch exception
        ...     print(exception)
        'tokenize_string_column' method can only take a Pandas.DataFrame with two columns. The current number of columns is 3.
    """
    import pandas
    from preprocessor.string_tools import String

    number_of_columns = self.dataframe.shape[1]
    if number_of_columns > 2:
        raise IndexError(
            "'tokenize_string_column' method can only take a Pandas.DataFrame with two columns. "
            "The current number of columns is %s." % number_of_columns)

    # get index positions of columns
    index_of_literal_column = self.dataframe.columns.get_loc(string_column_name)
    if id_column_name:
        index_of_id_column = self.dataframe.columns.get_loc(id_column_name)

    # tokenize literals at row level
    literal_column = self.dataframe[string_column_name]
    split_literal_column = literal_column.str.split(delimiter_pattern_in_literal_cells)

    # update the column
    self.dataframe[string_column_name] = split_literal_column

    # create a blank dataframe for the output
    original_column_names = list(self.dataframe.columns)
    output_dataframe = pandas.DataFrame(columns=original_column_names)

    # create a new row for each tokenized literal
    for each_row_number, each_row in self.dataframe.iterrows():
        row_values = each_row.values
        for each_literal in row_values[index_of_literal_column]:
            # clean unwanted spaces and delimiters at the heads and tails of tokens
            each_literal = String(each_literal)
            each_literal.clean_head_and_tail_iteratively_from_characters(' ')
            each_literal.clean_head_and_tail_iteratively_from_characters(delimiter_pattern_in_literal_cells)
            each_literal = str(each_literal)

            # do not allow empty rows to be part of the output dataframe
            if len(each_literal) > 0:
                if id_column_name:
                    output_dataframe.loc[len(output_dataframe)] = (row_values[index_of_id_column], each_literal)
                else:
                    output_dataframe.loc[len(output_dataframe)] = each_literal

    self.dataframe = output_dataframe
    return self
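# Hedged alternative sketch (not part of the class above): with pandas >= 0.25, the same
# one-token-per-row expansion can be written with Series.str.split plus DataFrame.explode,
# avoiding the row-by-row loop. The function and variable names below are illustrative only.
def _tokenize_with_explode_sketch(dataframe, string_column_name, delimiter):
    df = dataframe.copy()
    df[string_column_name] = df[string_column_name].str.split(delimiter)
    df = df.explode(string_column_name)  # one row per token; other columns are repeated
    df[string_column_name] = df[string_column_name].str.strip(' ' + delimiter)  # trim spaces and leftover delimiters
    df = df[df[string_column_name].str.len() > 0]  # drop rows where only delimiters remained
    return df.reset_index(drop=True)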
def purify_column(self, target_column_name):
    """
    Cleans the specified column from undesirable characters.

    Args:
        target_column_name(str): Column to be cleaned

    Returns:
        Data_Frame (updated self)

    Examples:
        >>> import pandas
        >>> # CLEAN A COLUMN =======================================================================================
        >>> # Create Data_Frame
        >>> my_dataframe = pandas.DataFrame({
        ...     'dirty_column': ['{string} & one', 'String, "two"', '[string] - three', '(string) /\ four;'],
        ...     'id_column': ['id 1', 'id 2', 'id 3', 'id 4'],
        ...     'another_column': ['abc', 'mno', 'pqr', 'xyz']})
        >>> my_Data_Frame = Data_Frame(my_dataframe)
        >>> my_Data_Frame.dataframe
          another_column       dirty_column id_column
        0            abc     {string} & one      id 1
        1            mno      String, "two"      id 2
        2            pqr   [string] - three      id 3
        3            xyz  (string) /\ four;      id 4
        >>> # Clean the column
        >>> my_Data_Frame.purify_column('dirty_column')\
            .dataframe
          another_column    dirty_column id_column
        0            abc  string and one      id 1
        1            mno     String, two      id 2
        2            pqr  string - three      id 3
        3            xyz     string four      id 4

        >>> # EXCEPTION: COLUMN MUST CONSIST OF STRINGS ============================================================
        >>> # Create a column that is made of integers
        >>> my_dataframe = pandas.DataFrame({
        ...     'integer_column': [1, 2, 3, 4]
        ... })
        >>> my_Data_Frame = Data_Frame(my_dataframe)
        >>> my_Data_Frame.dataframe
           integer_column
        0               1
        1               2
        2               3
        3               4
        >>> # Fail to purify integer column
        >>> try:
        ...     my_Data_Frame.purify_column('integer_column')
        ... except Exception as exception:  # catch exception
        ...     print(exception)
        The target column "integer_column" must be of dtype "object". It is currently of dtype "int64".
    """
    from preprocessor.string_tools import String

    target_column = self.dataframe[target_column_name]

    # the target column must be made of strings
    # (a string column is categorized as dtype 'object')
    self._force_column_type(target_column_name=target_column_name, dtype='object')

    conversion_dictionary = {
        '/': '',
        ';': '',       # a semicolon sometimes trails the last keyword (e.g., "kw1; kw2; kw3;" instead of "kw1; kw2; kw3")
        '&': 'and',
        r'\(|\)': '',  # ()
        r'\[|\]': '',  # []
        r'\{|\}': '',  # {}
        '  ': ' '      # clean double spaces (may occur after cleaning other characters)
    }

    # purify each string in the column
    for i, each_item in enumerate(target_column):
        each_String = String(each_item)
        each_String.purify(clean_from_non_ascii_characters=True,
                           remove_problematic_patterns=True,
                           clean_newline_characters=True)
        each_String.replace_patterns(conversion_dictionary)
        target_column.loc[i] = each_String.content
    return self
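# For reference, a hedged sketch of the same dictionary-driven cleaning done with vectorized
# pandas string methods (illustrative only; purify_column above delegates per cell to
# String.replace_patterns, whose exact semantics may differ).
def _replace_patterns_in_column_sketch(dataframe, column_name, conversion_dictionary):
    series = dataframe[column_name]
    for pattern, replacement in conversion_dictionary.items():
        series = series.str.replace(pattern, replacement, regex=True)  # keys are treated as regexes
    dataframe[column_name] = series
    return dataframe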
def clean_heads_and_tails_of_cells_in_column_from_patterns(self, target_column_name, patterns_to_remove, location):
    """
    Cleans the strings in the specified column from the specified patterns at their heads, at their
    tails, or at both ends.

    Args:
        target_column_name(str): Column to be cleaned
        patterns_to_remove(list): A list of strings containing the patterns to remove.
        location(str): Where the patterns should be removed from ('head', 'tail', or 'both').

    Keyword Args:
        head (location): Cleans the beginning of the string from the specified patterns
        tail (location): Cleans the end of the string
        both (location): Cleans both the beginning and the end of the string

    Returns:
        Data_Frame (updated self)

    Examples:
        >>> # INIT =================================================================================================
        >>> # Create Data_Frame
        >>> import pandas as pd
        >>> my_dataframe = pd.DataFrame({
        ...     'dirty_column': [';head issue', 'tail issue;', ';both issues;', ';complex situation; head',
        ...                      'complex situation; tail;', ';complex situation; both;'],
        ...     'id_column': ['id 1', 'id 2', 'id 3', 'id 4', 'id5', 'id6'],
        ...     'another_column': ['abc', 'def', 'mno', 'pqr', 'stu', 'xyz']})
        >>> my_Data_Frame = Data_Frame(my_dataframe)
        >>> my_Data_Frame.dataframe
          another_column               dirty_column id_column
        0            abc                ;head issue      id 1
        1            def                tail issue;      id 2
        2            mno              ;both issues;      id 3
        3            pqr   ;complex situation; head      id 4
        4            stu   complex situation; tail;       id5
        5            xyz  ;complex situation; both;       id6

        >>> # CLEAN HEAD ===========================================================================================
        >>> # Clean the heads of strings (without touching the same pattern elsewhere)
        >>> my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('dirty_column', [';'], 'head')\
            .dataframe
          another_column              dirty_column id_column
        0            abc                head issue      id 1
        1            def               tail issue;      id 2
        2            mno              both issues;      id 3
        3            pqr   complex situation; head      id 4
        4            stu  complex situation; tail;       id5
        5            xyz  complex situation; both;       id6

        >>> # CLEAN TAIL ===========================================================================================
        >>> # Clean the tails of strings (without touching the same pattern elsewhere)
        >>> my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('dirty_column', [';'], 'tail')\
            .dataframe
          another_column             dirty_column id_column
        0            abc               head issue      id 1
        1            def               tail issue      id 2
        2            mno              both issues      id 3
        3            pqr  complex situation; head      id 4
        4            stu  complex situation; tail       id5
        5            xyz  complex situation; both       id6

        >>> # CLEAN BOTH ===========================================================================================
        >>> # Recreate Data_Frame
        >>> my_dataframe = pd.DataFrame({
        ...     'dirty_column': [';head issue', 'tail issue;', ';both issues;', ';complex situation; head',
        ...                      'complex situation; tail;', ';complex situation; both;'],
        ...     'id_column': ['id 1', 'id 2', 'id 3', 'id 4', 'id5', 'id6'],
        ...     'another_column': ['abc', 'def', 'mno', 'pqr', 'stu', 'xyz']})
        >>> my_Data_Frame = Data_Frame(my_dataframe)
        >>> my_Data_Frame.dataframe
          another_column               dirty_column id_column
        0            abc                ;head issue      id 1
        1            def                tail issue;      id 2
        2            mno              ;both issues;      id 3
        3            pqr   ;complex situation; head      id 4
        4            stu   complex situation; tail;       id5
        5            xyz  ;complex situation; both;       id6
        >>> # Clean both the heads and tails of strings (without touching the same pattern elsewhere)
        >>> # Note that when the location is 'both', removal proceeds ONLY if the pattern exists at both head and tail
        >>> my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('dirty_column',
        ...                                                                      patterns_to_remove=[';'],
        ...                                                                      location='both')\
            .dataframe
          another_column              dirty_column id_column
        0            abc               ;head issue      id 1
        1            def               tail issue;      id 2
        2            mno               both issues      id 3
        3            pqr  ;complex situation; head      id 4
        4            stu  complex situation; tail;       id5
        5            xyz   complex situation; both       id6

        >>> # EXCEPTION: COLUMN MUST CONSIST OF STRINGS ============================================================
        >>> # Create a column that is made of integers
        >>> my_dataframe = pd.DataFrame({
        ...     'integer_column': [1, 2, 3, 4]
        ... })
        >>> my_Data_Frame = Data_Frame(my_dataframe)
        >>> my_Data_Frame.dataframe
           integer_column
        0               1
        1               2
        2               3
        3               4
        >>> # Fail to clean integer column from characters
        >>> try:
        ...     my_Data_Frame.clean_heads_and_tails_of_cells_in_column_from_patterns('integer_column', [';'], 'head')
        ... except Exception as exception:  # catch exception
        ...     print(exception)
        The target column "integer_column" must be of dtype "object". It is currently of dtype "int64".
    """
    from preprocessor.string_tools import String

    target_column = self.dataframe[target_column_name]

    # the target column must be made of strings
    # (a string column is categorized as dtype 'object')
    self._force_column_type(target_column_name=target_column_name, dtype='object')

    # clean each string in the column from the specified patterns
    for i, each_item in enumerate(target_column):
        each_String = String(each_item)
        each_String.clean_head_and_tail_from_patterns(patterns_to_remove=patterns_to_remove, location=location)
        target_column.loc[i] = each_String.content
    return self
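# A plain-Python sketch of the documented head/tail semantics (an assumption about what
# String.clean_head_and_tail_from_patterns does internally; note that for 'both', removal
# happens only when the pattern occurs at both ends, as the doctest above shows).
def _strip_pattern_at_ends_sketch(text, pattern, location):
    has_head = text.startswith(pattern)
    has_tail = text.endswith(pattern)
    if location == 'head' and has_head:
        return text[len(pattern):]
    if location == 'tail' and has_tail:
        return text[:-len(pattern)]
    if location == 'both' and has_head and has_tail:
        return text[len(pattern):-len(pattern)]
    return text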
###############################################################

my_row = CSV_Line(' "a" , "b" , "c" ,').clean_from_newline_characters().\
    clean_head_and_tail_from_patterns(' ', 'head').\
    clean_head_and_tail_from_patterns(' ,', 'tail').\
    parse_line_and_CONVERT_to_CSV_Row(' , ').\
    clean_cell_heads_and_tails_from_characters('"')  # usage of chained methods (fluent interface) is optional
print(my_row)

my_row.format_for_print_and_CONVERT_to_CSV_Line(column_separator=' , ',
                                                line_head=' ',
                                                line_tail=' ,',
                                                cell_wrapper='"')

String('-----STRING=======').clean_head_and_tail_iteratively_from_characters('-=')
String('ABC123').clip_at_index(4, remove='tail')

##########################################################
####################### CSV EXPORT #######################
##########################################################

demo_bibliography = Bibliography()
demo_bibliography.importBibtex('examples/example_data/IDR_Literature_WOS.bib')
demo_bibliography.exportToCsv(
    output_file_path='examples//example_data//demo_output.csv',
    columns_to_ignore=['b_document', 'b_authors', 'b_topics', 'b_journal',
                       'b_publication_month', 'b_issue_number', 'b_volume',
                       'b_pages', 'b_pure_bibliography_id'],
)
""" Retrieves all articles from OpenCitations that has the same DOI with the records in VU and UvA bibliographies. """ # parse list from file (probably exists in ListData) from retriever.sparql_tools import Open_Citations_Query from meta.consoleOutput import ConsoleOutput from preprocessor.string_tools import String console = ConsoleOutput('log.txt') doi_list = [] with open('Input//all_dois_in_uva_and_vu_bibliographies.csv', encoding='utf8') as doi_file: for each_line in doi_file: each_line = String(each_line) each_line.clean_from_newline_characters() doi_list.append(str(each_line)) oc_query = Open_Citations_Query() oc_query.retrieve_articles_by_dois(doi_list, show_progress_bar=True) oc_query.write_results_to_csv('Input//oc_articles_with_matching_dois_v1.3.csv') # A demo list with 100 DOIs # doi_list = ['10.1163/187607508X384689', '10.1017/S0954579416000572', '10.1007/s11562-016-0353-7', '10.1016/j.adolescence.2016.09.008', '10.1186/s13561-016-0122-6', '10.1007/s00799-016-0182-6', '10.5194/gmd-2016-266', '10.1007/s00737-015-0531-2', '10.1103/RevModPhys.88.021003', 'https://doi.org/10.1101/167171', 'https://doi.org/10.1016/j.chb.2017.04.047', '10.1016/j.trb.2016.09.005', '10.1016/j.ancene.2016.01.001', '10.1111/adb.12322', '10.1017/njg.2016.45', '10.1080/1359432X.2016.1209489', '10.1117/1.JBO.21.6.066008', '10.5194/gmd-10-3329-2017', '10.1016/j.rser.2017.01.103', '10.1177/2050157916664559', '10.1007/978-3-319-45931-8_17', '10.1007/s11136-015-1171-8', '10.1145/2991079.2991121', '10.1093/cz/zow089', '10.1126/science.aac8167', '10.1007/s00586-016-4606-1', '10.1186/s12937-017-0229-6', '10.1007/s11357-016-9894-1', '10.1080/00130095.2015.1094371', '10.1016/j.epsl.2016.02.028', '10.1371/journal.pone.0168636', '10.1016/j.atmosres.2016.03.016', '10.1111/deci.12206', '10.1126/science.aad9634', '10.1103/PhysRevA.94.012506', '10.4103/0019-5545.196846', '10.1016/j.cedpsych.2017.01.006', '10.3324/haematol.2015.133470', '10.1057/978-1-137-50956-7', '10.1016/j.scico.2016.04.001', 'https://doi.org/10.1016/j.scico.2016.04.001', '10.1080/03081087.2015.1053425', '10.3758/s13423-017-1270-3', '10.1681/ASN.2015030287', '10.1016/j.avb.2016.05.006', '10.1177/0971333616689191', '10.1002/sej.1243', '10.1016/j.foreco.2017.06.023', '10.1103/PhysRevLett.118.071801', 'https://doi.org/10.1093/geront/gnv127', '10.1007/978-3-319-42324-1_16', '10.1109/JBHI.2015.2412656', '10.1016/j.jeem.2016.04.002', '10.1080/00207543.2015.1058982', '10.1038/mp.2016.100', '10.1080/03003930.2016.1194267', '10.1016/j.envint.2017.01.018', '10.1038/pr.2015.179', '10.1177/1753193416669263', '10.1016/j.tre.2016.11.003', '10.1021/acs.jpcc.5b12016', '10.1002/anie.201603510', '10.1073/pnas.1607005113', '(DOI) - 10.1111/cch.12521', '10.1017/S0016756815000886', '10.1080/1350293X.2015.1073507', '10.1152/jn.00701.2015', '10.1371/journal.pone.0170791', '10.1016/j.seares.2016.07.005', '10.1016/j.reseneeco.2016.03.003', '10.1007/s00531-017-1499-0', '10.1007/s41669-017-0014-7', '10.1093/acrefore/9780190228613.013.439', '10.14814/phy2.13201', '10.1016/j.jtrangeo.2016.10.013', '10.1523/JNEUROSCI.3658-16.2017', '10.1192/bjpo.bp.115.000166', '10.1136/bmjgh-2016-000109', '10.7554/eLife.20320.001', '10.1037/pas0000332', '10.1177/1474704916673841', '10.1057/978-1-137-58179-2', '10.1002/ejp.963', '10.1017/thg.2016.78', '10.1038/tpj.2016.32', '10.1016/j.jesp.2017.03.008', '10.1287/trsc.2015.0647', '10.1186/s13015-016-0087-3', '10.1016/j.neuroimage.2016.10.030', '10.1371/journal.pone.0169109', '10.1007/s11367-017-1358-z', '10.1080/1369183X.2015.1061425', 
'10.2196/mental.4614', '10.1002/arp.1564', '10.1021/acs.orglett.6b01023', '10.3847/1538-4357/aa6c47', 'http://www.socialevraagstukken.nl/veiligheid-creeer-je-met-geborgenheid/', '10.1186/s12888-016-0790-0', '10.1371/journal.pone.0155755', '10.1103/PhysRevLett.116.241801']
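# The demo list above shows that DOIs can arrive with a URL prefix or a free-text annotation
# (e.g., 'https://doi.org/10.1101/167171', '(DOI) - 10.1111/cch.12521'). A hedged sketch of a
# normalizer that extracts the bare DOI before querying (illustrative; not part of
# Open_Citations_Query):
import re

def _normalize_doi_sketch(raw_doi):
    match = re.search(r'10\.\d{4,9}/\S+', raw_doi)  # a DOI starts with '10.', a registrant code, and '/'
    return match.group(0) if match else None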
def get_line_at_position_from_file(self, line_number):
    """
    Returns a specified line from the TextFile without reading the whole file into memory.

    Args:
        line_number(int): Number of the line to return. Must be a positive integer (line numbers
            start from 1).

    Returns:
        String class object (created from string at line in file).

    See Also:
        CSV_File.get_line_at_position_from_file

    Examples:
        >>> # return first line of file
        >>> my_file = Text_File('test_data//example_merged_yasgui_1000.csv')
        >>> my_file.get_line_at_position_from_file(1)
        '"publication_type" , "journal_article" , "title" , "publication_year" , "author_name" , "journal_name" , "journal_issue_number" , "journal_volume_number" , "startEndPages" , "publisher_name" , "doi" , "cited_by_article" ,'
        >>> # return another line
        >>> my_file.get_line_at_position_from_file(122)
        '"Journal Article" , "https://w3id.org/oc/corpus/br/3448" , "Perioperative Myocardial Infarction" , "2009" , "Beattie - W. S. | Mosseri - M. | Jaffe - A. S. | Alpert - J. S." , "Circulation" , "22" , "119" , "2936--2944" , "Ovid Technologies (Wolters Kluwer Health)" , "10.1161/circulationaha.108.828228" , "https://w3id.org/oc/corpus/br/3426" ,'
        >>> # return last line
        >>> my_file.get_line_at_position_from_file(267)
        '"Journal Article" , "https://w3id.org/oc/corpus/br/3437" , "Myocardial Injury after Noncardiac Surgery" , "2014" , "Niebrzegowska - Edyta | Benton - Sally | Wragg - Andrew | Archbold - Andrew | Smith - Amanda | McAlees - Eleanor | Ramballi - Cheryl | MacDonald - Neil | Januszewska - Marta | Shariffuddin - Ina I. | Vasanthan - V. | Hashim - N. H. M. | Undok - A. Wahab | Ki - Ushananthini | Lai - Hou Yee | Ahmad - Wan Azman | Ackland - Gareth | Khan - Ahsun | Almeida - Smitha | Cherian - Joseph | Furruqh - Sultana | Abraham - Valsa | Paniagua - Pilar | Urrutia - Gerard | Maestre - Mari Luz | Santaló - Miquel | Gonzalez - Raúl | Font - Adrià | Martínez - Cecilia" , "Anesthesiology" , "3" , "120" , "564--578" , "Ovid Technologies (Wolters Kluwer Health)" , "10.1097/aln.0000000000000113" , "https://w3id.org/oc/corpus/br/3522 | https://w3id.org/oc/corpus/br/300243 | https://w3id.org/oc/corpus/br/3062326 | https://w3id.org/oc/corpus/br/3271454 | https://w3id.org/oc/corpus/br/3879533 | https://w3id.org/oc/corpus/br/4205354 | https://w3id.org/oc/corpus/br/5253819 | https://w3id.org/oc/corpus/br/6332120 | https://w3id.org/oc/corpus/br/7799424 | https://w3id.org/oc/corpus/br/8003885 | https://w3id.org/oc/corpus/br/8185544" ,'
        >>> # erroneous index number entered (0)
        >>> my_file = Text_File('test_data//example_merged_yasgui_1000.csv')
        >>> try:
        ...     my_file.get_line_at_position_from_file(0)  # line_number cannot be 0
        ... except Exception as error_message:
        ...     print('Exception: ' + str(error_message))
        Exception: Parameter value must be a positive integer but is "0" of <class 'int'>.
        >>> # erroneous index number entered (too high)
        >>> try:
        ...     my_file.get_line_at_position_from_file(300)  # there is no 300th line in the file
        ... except IndexError as error_message:
        ...     print('Exception: ' + str(error_message))
        Exception: Requested line number '300' does not exist in file.
    """
    from preprocessor.string_tools import String, Parameter_Value
    Parameter_Value(line_number).force_positive_integer()

    with open(self.input_file_path, encoding='utf8') as input_file:
        line = None
        for i, each_line in enumerate(input_file):
            current_iteration_step = i + 1  # align index numbers (start from 0) with line numbers (start from 1)
            if current_iteration_step == line_number:
                line = String(each_line)
            elif current_iteration_step > line_number:
                break

    if line is None:
        raise IndexError("Requested line number '%s' does not exist in file." % line_number)

    # if not cleaned from '\n', comparisons and operations tend to be problematic;
    # when writing to file, use the base print() function to restore the trailing newline
    line.clean_from_newline_characters()
    return line
def is_each_row_balanced(self, exclude_special_rows_of_syntax=None):
    """
    Checks whether each row in the buffer is balanced (i.e., does not have unmatched parentheses,
    brackets, etc.). Can exclude special row types (e.g., comments) from evaluation.

    Args:
        exclude_special_rows_of_syntax(str): Specifies what type of rows to exclude from evaluation
            (e.g., comment rows). Uses predefined syntax settings per specified syntax (e.g., 'bibtex').

    Keyword Args:
        - bibtex (exclude_special_rows_of_syntax): sets evaluation exclusion criteria for bibtex syntax

    Returns:
        boolean

    Examples:
        >>> # an unbalanced row is present
        >>> my_buffer = ListBuffer()
        >>> my_buffer.append_row(['a', 'b', 'c']).append_row(['d', 'e', 'f']).dataset
        [['a', 'b', 'c'], ['d', 'e', 'f']]
        >>> my_buffer.append_row(['g', 'h', '>'])\
            .is_each_row_balanced()
        False

        >>> # single row from a bib file
        >>> my_buffer = ListBuffer()
        >>> my_buffer.append_row(' year = "2017",')\
            .is_each_row_balanced()
        True

        >>> # bibtex entry start (without vs. with bibtex exclusion)
        >>> my_buffer.append_row('@article{96d9add3e2f44e8abbf030170689bc30,')\
            .is_each_row_balanced()
        False
        >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')
        True

        >>> # bibtex comment (without vs. with bibtex exclusion)
        >>> my_buffer = ListBuffer()
        >>> my_buffer.append_row('% This is a comment with unbalanced characters }]>')\
            .is_each_row_balanced()
        False
        >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')
        True

        >>> # a full bibtex entry with an unbalanced curly bracket in the title field
        >>> my_buffer = ListBuffer()
        >>> my_buffer.dataset = ['@book{a82caf00e1a143759c7f5543b6c84ea5,',
        ...                      'title = "{Knowledge Representation for Health Care (AIME 2015 International Joint Workshop, KR4HC/ProHealth 2015)",',
        ...                      'author = "D Riano and R. Lenz and S Miksch and M Peleg and M. Reichert and {ten Teije}, A.C.M.",',
        ...                      'year = "2015",',
        ...                      'doi = "10.1007/978-3-319-26585-8",',
        ...                      'isbn = "9783319265841",',
        ...                      'series = "LNAI",',
        ...                      'publisher = "Springer",',
        ...                      'number = "9485",',
        ...                      '}',
        ...                      '']
        >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')  # error
        False

        >>> # the same entry with the unbalanced curly bracket removed
        >>> my_buffer.dataset = ['@book{a82caf00e1a143759c7f5543b6c84ea5,',
        ...                      'title = "Knowledge Representation for Health Care (AIME 2015 International Joint Workshop, KR4HC/ProHealth 2015)",',
        ...                      'author = "D Riano and R. Lenz and S Miksch and M Peleg and M. Reichert and {ten Teije}, A.C.M.",',
        ...                      'year = "2015",',
        ...                      'doi = "10.1007/978-3-319-26585-8",',
        ...                      'isbn = "9783319265841",',
        ...                      'series = "LNAI",',
        ...                      'publisher = "Springer",',
        ...                      'number = "9485",',
        ...                      '}',
        ...                      '']
        >>> my_buffer.is_each_row_balanced(exclude_special_rows_of_syntax='bibtex')
        True
    """
    from preprocessor.string_tools import String
    buffer = self.dataset

    is_balanced_log = []
    for each_row in buffer:
        each_row = String(str(each_row))

        if not each_row.is_balanced():
            ### EXCLUSIONS FOR BIBTEX ###########################################
            if exclude_special_rows_of_syntax == 'bibtex':
                # forgive these row types, even though they are unbalanced
                if each_row.is_line_type('bibtex', 'start of entry') \
                        or each_row.is_line_type('bibtex', 'end of entry') \
                        or each_row.is_line_type('bibtex', 'comment'):
                    is_balanced_log.append(True)
                else:
                    is_balanced_log.append(False)
            #####################################################################
            else:
                is_balanced_log.append(False)
        else:
            is_balanced_log.append(True)

    return False not in is_balanced_log
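# For reference, a minimal stack-based check of the kind String.is_balanced presumably performs
# (an assumption; the real implementation lives in preprocessor.string_tools). The '>' in the
# doctests above suggests angle brackets are among the tracked pairs.
def _is_balanced_sketch(text):
    pairs = {')': '(', ']': '[', '}': '{', '>': '<'}
    stack = []
    for character in text:
        if character in pairs.values():      # opening bracket: remember it
            stack.append(character)
        elif character in pairs:             # closing bracket: must match the last opening one
            if not stack or stack.pop() != pairs[character]:
                return False
    return not stack                         # any leftover opening bracket means imbalance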
def cleanAndTokenizeCsv(instance):
    """
    Imports the .csv file as raw text, cleans it (if a cleaning algorithm is specified), and then
    tokenizes it.

    Returns:
        List containing parsed data from the .csv file. For each row in the .csv file (including the
        header row), a sub-list is created in the main list.

    Examples:
        >>> from preprocessor.Text_File import Text_File
        >>> my_file = Text_File('example_data//problematic_yasgui_csv_file.csv')
        >>> my_file.print_lines(2)
        "27624462" , "2016" , "Journal Article" , "Duku - Stephen Kwasiââ OpokuĂŠ | Asenso-Boadi - Francis" , "[]{}\ '<Utilization, of, healthcare services and renewal of health insurance membership: evidence of adverse selection in Ghana" , "Springer Science + Business Media" , "1" , "Health Econ Rev - Health Economics Review" , "http://dx.doi.org/10.1186/s13561-016-0122-6" , "6" , "10.1186/s13561-016-0122-6" , "" , "https://w3id.org/oc/corpus/br/3555801" , "https://w3id.org/oc/corpus/br/18754 | https://w3id.org/oc/corpus/br/18792" ,
        >>> my_csv_bibliography = CSV_Bibliography(
        ...     csv_file_path='example_data//problematic_yasgui_csv_file.csv',
        ...     id_column_header='journal_article',
        ...     field_value_list_separator=' | ',
        ...     csv_delimiter_character=',',
        ...     cleaning_algorithm='default'
        ... )
        Conversion from ListData to Bibliography object started
        Conversion completed. 2 out of 2 ListData rows converted to Bibliography object entries
        >>> my_csv_bibliography.preview(1)  # notice the character conversions in the 'authors' and 'title' fields
        <BLANKLINE>
        ----------------------------------ENTRY 1----------------------------------
        ('https://w3id.org/oc/corpus/br/3555801',
         {'': '',
          'authors': ['Duku - Stephen Kwasiaa OpokuAS', 'Asenso-Boadi - Francis'],
          'cited_by_the_articles': '',
          'cited_the_articles': ['https://w3id.org/oc/corpus/br/18754',
                                 'https://w3id.org/oc/corpus/br/18792'],
          'doi': '10.1186/s13561-016-0122-6',
          'journal_article': 'https://w3id.org/oc/corpus/br/3555801',
          'journal_issue_number': '1',
          'journal_name': 'Health Econ Rev - Health Economics Review',
          'journal_volume_number': '6',
          'pmid': '27624462',
          'publication_type': 'Journal Article',
          'publication_year': '2016',
          'publisher_name': 'Springer Science + Business Media',
          'title': ' Utilization-of-healthcare services and renewal of health '
                   'insurance membership: evidence of adverse selection in Ghana',
          'url': 'http://dx.doi.org/10.1186/s13561-016-0122-6'})
        <BLANKLINE>
    """
    import re
    import csv
    from os import remove as os_remove
    from preprocessor.string_tools import String

    # open the csv file and read it into a variable
    imported_file_raw = open(instance.csv_file_path, mode="r", encoding="utf8")
    imported_string_raw = imported_file_raw.read()

    # if no cleaning algorithm is specified, skip cleaning and just tokenize
    if instance.cleaning_algorithm == 'parse only':
        imported_string_cleaned = imported_string_raw

    # otherwise, run the cleaning algorithm
    elif instance.cleaning_algorithm == 'default':
        # TODO: The current way to remove in-string commas is tuned for OpenCitations data with
        # yasgui-style CSV. Make a generic version by using a while loop (see commented-out draft below).

        # clean commas that occur in entry field values (i.e., within strings)
        imported_string_cleaned = re.sub(' ,', '_-_-_', imported_string_raw)
        imported_string_cleaned = re.sub(', ', '-', imported_string_cleaned)
        imported_string_cleaned = re.sub('_-_-_', ' ,', imported_string_cleaned)

        # clean the CSV file from double quotes
        imported_string_cleaned = re.sub(' "|" ', '', imported_string_cleaned)

        # clean from characters and patterns that are generally problematic for parsing operations
        imported_string_cleaned = String(imported_string_cleaned)
        imported_string_cleaned.purify(clean_from_non_ascii_characters=True,
                                       remove_problematic_patterns=True,
                                       clean_newline_characters=False)
        imported_string_cleaned = str(imported_string_cleaned)

        # # Draft while loop for a more generic future algorithm to replace in-string commas:
        # between_quotes = False
        # for i, each_character in enumerate(imported_string_cleaned):
        #     if between_quotes:
        #         if each_character == ",":
        #             imported_string_cleaned[i] = "-"
        #             print(imported_string_cleaned)
        #     # first occurrence
        #     if each_character == '\"' and not between_quotes:
        #         between_quotes = True
        #     elif each_character == '\"' and between_quotes:
        #         between_quotes = False

    # if the cleaning_algorithm parameter is not recognized, raise an error
    else:
        raise ValueError('Unknown algorithm type: ' + instance.cleaning_algorithm
                         + '. Please enter a valid algorithm string.')

    # close the original csv file (no changes were made to it)
    imported_file_raw.close()

    # create a temporary file to hold the cleaned csv content (a file is needed for csv.reader())
    cleaned_file_path = "temp_cleaned.csv"
    cleaned_csv_file = open(cleaned_file_path, mode="w", encoding="utf8")
    cleaned_csv_file.write(imported_string_cleaned)
    cleaned_csv_file.close()

    # read from the temporary file and tokenize it
    cleaned_csv_file = open(cleaned_file_path, mode="r", encoding="utf8")
    cleaned_csv_file_content = list(csv.reader(cleaned_csv_file, delimiter=instance.csv_delimiter_character))
    cleaned_csv_file.close()

    # remove the temporary file
    os_remove(cleaned_file_path)

    return cleaned_csv_file_content
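# The TODO above asks for a more generic replacement of the regex-based comma handling. A hedged
# sketch: when the quoting in the source file is intact, Python's csv module already keeps
# in-string commas inside their cells, which could make the pre-cleaning step unnecessary
# (illustrative only; not the current behavior of cleanAndTokenizeCsv):
import csv

def _tokenize_csv_sketch(csv_file_path, delimiter=','):
    with open(csv_file_path, mode='r', encoding='utf8') as csv_file:
        reader = csv.reader(csv_file, delimiter=delimiter, quotechar='"', skipinitialspace=True)
        return [[cell.strip() for cell in row] for row in reader]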