Beispiel #1
0
class BianaParser(object):
    """
    General parser base class for BIANA.

    On construction it reads the command line (see parseArguments), stores
    the recognised options as attributes and pre-computes the maximum text
    length accepted for every external entity attribute. start() then opens
    the BIANA database, registers the external database being parsed and
    runs parse_database().

    parse_database(), print_help() and get_log_string() are called by this
    class but are not defined in this part of the file; presumably they are
    provided elsewhere in the class or by the concrete parsers -- confirm.
    """

    def __init__(
            self,
            default_db_description=None,
            default_script_name="bianaParser.py",
            default_script_description="This file implements a program that fills up tables in database biana with information from distinct databases",
            additional_compulsory_arguments=None,
            additional_optional_arguments=None):
        """
        Starts the bianaParser Object

        default_db_description: default value of the "database-description"
            option.
        default_script_name: stored as self.script_name.
        default_script_description: stored as self.script_description.
        additional_compulsory_arguments: extra (option, default, help)
            tuples appended to the compulsory options; None means no extras.
        additional_optional_arguments: extra (option, default, help) tuples
            appended to the optional options; None means no extras.

        Option names ending in "=" take a value on the command line.
        """

        print("Parser object started")

        self.compulsory_arguments = [
            ("input-identifier=", None,
             "path or file name of input file(s) containing database data. Path names must end with \"/\"."),
            ("biana-dbname=", None, "name of database biana to be used"),
            ("biana-dbhost=", None,
             "name of host where database biana to be used is placed"),
            ("database-name=", None,
             "internal identifier name to this database (it must be unique in the database)"),
            ("database-version=", None,
             "version of the database to be inserted"),
        ]
        self.compulsory_arguments.extend(additional_compulsory_arguments or [])

        # NOTE(review): the default value of the two credential options was
        # scrubbed in the text this block was recovered from; None matches
        # every other value-taking optional option -- confirm.
        self.optional_arguments = [
            ("biana-dbuser=", None,
             "username accessing the database (not required in most systems)"),
            ("biana-dbpass=", None,
             "password of username accessing the database (not required in most systems"),
            ("help", None, "prints this message and exits"),
            ("verbose", 0, "prints process info to stdout"),
            ("log-file=", None,
             "Prints a log file of the parsing result (number of inserted proteins, references...)"),
            ("time-control", None,
             "prints to stderr a control of the timing of the parser"),
            ("database-description=", default_db_description,
             "Description of the database to be inserted."),
            ("optimize-for-parsing", None, "Optimizes database for parsing"),
            ("promiscuous", False,
             "sets the database to be parsed as promiscuous (whose entities can be included in multi user entities)"),
        ]
        self.optional_arguments.extend(additional_optional_arguments or [])

        self.script_name = default_script_name
        self.script_description = default_script_description

        # Parse the command line and expose every general option as an
        # attribute.
        self.arguments_dic = self.parseArguments()
        self.input_file = self.arguments_dic["input-identifier"]
        self.biana_dbname = self.arguments_dic["biana-dbname"]
        self.biana_dbhost = self.arguments_dic["biana-dbhost"]
        self.sourcedb_name = self.arguments_dic["database-name"]
        self.sourcedb_version = self.arguments_dic["database-version"]
        self.biana_dbuser = self.arguments_dic["biana-dbuser"]
        self.biana_dbpass = self.arguments_dic["biana-dbpass"]
        self.help = self.arguments_dic["help"]
        self.verbose = self.arguments_dic["verbose"]
        self.time_control = self.arguments_dic["time-control"]
        self.log_file = self.arguments_dic["log-file"]
        self.optimize_for_parsing = self.arguments_dic["optimize-for-parsing"]
        # Flag deciding whether the database gives information that is going
        # to be added to more than one user entity.
        self.is_promiscuous = self.arguments_dic["promiscuous"]

        self.database = None
        # "default-attribute" only exists when a concrete parser declared it
        # as an additional argument; otherwise fall back to an empty string.
        self.default_eE_attribute = self.arguments_dic.get(
            "default-attribute", "")

        # Maximum accepted text length per attribute identifier (lower
        # case), derived from the SQL type declared for the attribute.
        # Attributes whose type is not recognised get no entry.
        self.attribute_identifier_to_max_value_length = {}
        attribute_types = (
            EXTERNAL_ENTITY_IDENTIFIER_ATTRIBUTES +
            EXTERNAL_ENTITY_VERSIONABLE_IDENTIFIER_ATTRIBUTE_TYPES +
            EXTERNAL_ENTITY_DESCRIPTIVE_SEARCHABLE_ATTRIBUTE_TYPES +
            EXTERNAL_ENTITY_DESCRIPTIVE_ATTRIBUTE_TYPES +
            EXTERNAL_ENTITY_NUMERIC_ATTRIBUTE_TYPES)
        for key, value in attribute_types:
            max_length = self._max_value_length(value.lower())
            if max_length is not None:
                self.attribute_identifier_to_max_value_length[
                    key.lower()] = max_length

    @staticmethod
    def _max_value_length(sql_type):
        """
        Return the maximum text length for a lower-cased SQL type string,
        or None when the type is not one of the recognised ones.
        """
        # Sized character types: the declared size is the limit.
        for prefix in ("varchar(", "char(", "text("):
            if sql_type.startswith(prefix):
                return int(sql_type[len(prefix):].rstrip(")"))

        if sql_type.startswith("integer("):
            if sql_type.endswith(" unsigned"):
                sql_type = sql_type[:-len(" unsigned")]
            # The declared size is used as a number of bytes; the limit is
            # the number of decimal digits of 256 ** size.
            size = int(sql_type[len("integer("):].rstrip(")"))
            return len(str((2 ** 8) ** size))
        if sql_type.startswith("integer"):
            return len(str((2 ** 8) ** 4))
        if sql_type.startswith("smallint"):
            return len(str((2 ** 8) ** 2))
        if sql_type.startswith("text"):
            # Unsized text: effectively unlimited.
            return 999999
        return None

    def verify_attribute_length(self, attribute_identifier, attribute_value):
        """
        Warn on stderr when an attribute value is empty or too long.

        attribute_identifier: attribute name; looked up case-insensitively
            because the length table is keyed by lower-cased identifiers.
        attribute_value: text value to check (it is lower-cased before
            being measured and reported).

        Raises KeyError when no maximum length is known for the identifier.
        Nothing is returned and the value is never modified or rejected.
        """
        attribute_value = attribute_value.lower()
        length = len(attribute_value)
        max_length = self.attribute_identifier_to_max_value_length[
            attribute_identifier.lower()]
        if length > max_length:
            sys.stderr.write("\n%s longer than expected: %s\n" %
                             (attribute_identifier, attribute_value))
        elif length == 0:
            sys.stderr.write("\n%s has 0 length: %s\n" %
                             (attribute_identifier, attribute_value))

    def start(self):
        """
        Run the whole parsing process against the BIANA database.

        Opens the database connection, registers the external database
        described by the command-line options, calls parse_database() and
        finally close(). If anything fails once the external database has
        been registered, the traceback is printed, the database changes are
        rolled back and the process exits with status 1.

        The process also exits with status 1 when the database name or
        version is an integer, which is what parseArguments stores for an
        option given without a value.
        """

        print("Parser started")
        if isinstance(self.sourcedb_name, int) or isinstance(
                self.sourcedb_version, int):
            sys.stderr.write(
                "You must insert correctly the database name and database version\n"
            )
            sys.exit(1)

        self.database_description = self.arguments_dic["database-description"]

        # Log dictionary where all log information will be stored
        self.log = {}
        if self.log_file:
            # open() instead of file(): file() does not exist on Python 3.
            self.log_file_fd = open(self.log_file, 'w')

        self.biana_access = BianaDBaccess(dbname=self.biana_dbname,
                                          dbhost=self.biana_dbhost,
                                          dbuser=self.biana_dbuser,
                                          use_buffer=True,
                                          dbpassword=self.biana_dbpass,
                                          lock_tables=True,
                                          check_integrity=True)

        # Reference point for the parsing time and for the time reports
        # written by close().
        self.initial_time = time.time()

        # Describe the database being parsed and register it in BIANA.
        self.database = ExternalDatabase(
            databaseName=self.sourcedb_name,
            databaseVersion=self.sourcedb_version,
            databaseFile=self.input_file.split(os.sep)[-1],
            databaseDescription=self.database_description,
            defaultExternalEntityAttribute=self.default_eE_attribute,
            isPromiscuous=self.is_promiscuous)

        self.biana_access.insert_new_external_database(
            externalDatabase=self.database)

        # Opening the input file descriptor is a responsibility of the
        # subclasses' parse_database method.
        try:
            if self.optimize_for_parsing:
                self.biana_access.optimize_database_for(mode="parsing")

            self.parse_database()

            # set the parsing time
            self.database.set_parsing_time(int(time.time() -
                                               self.initial_time))

            # Updates the information that this external database has inserted
            self.biana_access.update_external_database_external_entity_attributes(
                self.database)

            self.close()

        except:
            # Deliberately catches everything (including KeyboardInterrupt
            # and SystemExit) so that any interruption rolls back.
            traceback.print_exc()
            sys.stderr.write(
                "ERROR WHILE PARSING. ALL MODIFICATIONS ARE GOING TO BE DELETED\n"
            )
            self.biana_access._rollback()
            sys.exit(1)

    # METHODS

    def close(self):
        """
        Close the database connection and emit the final reports.

        As bianaDBaccess uses an internal buffer, it is necessary to close
        the connection to be sure that all inserts are correctly done, as
        well as to unlock tables. Afterwards the total time is written to
        stderr (time-control / verbose) and the log string is written to the
        log file and/or stderr, depending on the options given.
        """

        self.biana_access.close()

        if self.time_control:
            sys.stderr.write("Total time: %s seconds\n" %
                             (time.time() - self.initial_time))

        if self.log_file:
            self.log_file_fd.write(self.get_log_string())
            self.log_file_fd.close()

        if self.verbose:
            sys.stderr.write("\n Total time: %s \n" %
                             (time.time() - self.initial_time))
            sys.stderr.write(self.get_log_string())

    ## GENERAL PARSER METHODS ##
    def parseArguments(self):
        """
        Method that returns a dictionary with the values of the arguments

        Keys are the option names without the trailing "="; every option
        starts with its declared default. Options given without a value are
        stored as 1. Parsing starts at sys.argv[2], so the first positional
        argument after the script name is skipped (presumably the name of
        the concrete parser -- confirm against the launcher script).

        Raises ValueError when getopt rejects the command line. Exits with
        status 2 after printing the help when "--help" is given or when a
        compulsory argument is missing.
        """

        arguments = self.compulsory_arguments + self.optional_arguments

        # getopt needs the raw specifications (the trailing "=" marks the
        # options that take a value); the result is keyed by the bare name.
        list_arguments = [argument[0] for argument in arguments]
        return_dict = {}
        for argument in arguments:
            return_dict[argument[0].replace("=", "")] = argument[1]

        try:
            opts, _positional = getopt.getopt(sys.argv[2:], "",
                                              list_arguments)
        except getopt.GetoptError as bad_opt:
            raise ValueError("%s\n" % (str(bad_opt)))

        for option, value in opts:
            if option == "--help":
                self.print_help()
                sys.exit(2)
            # getopt always reports the full option name, so the name can
            # be looked up directly.
            option_name = option[2:]
            if option_name in return_dict:
                return_dict[option_name] = 1 if value == "" else value

        # Check for all compulsory arguments:
        for comp_arg in self.compulsory_arguments:
            comp_name = comp_arg[0].replace("=", "")
            if return_dict[comp_name] is None:
                sys.stderr.write("%s argument is not defined!\n" %
                                 (comp_name))
                self.print_help()
                sys.exit(2)

        return return_dict
Beispiel #2
0
class BianaParser(object):
    """
    General parser base class for BIANA.

    On construction it reads the command line (see parseArguments), stores
    the recognised options as attributes and pre-computes the maximum text
    length accepted for every external entity attribute. start() then opens
    the BIANA database, registers the external database being parsed and
    runs parse_database().

    parse_database(), print_help() and get_log_string() are called by this
    class but are not defined in this part of the file; presumably they are
    provided elsewhere in the class or by the concrete parsers -- confirm.
    """

    def __init__(self, default_db_description=None,
                 default_script_name="bianaParser.py",
                 default_script_description="This file implements a program that fills up tables in database biana with information from distinct databases",
                 additional_compulsory_arguments=None,
                 additional_optional_arguments=None):
        """
        Starts the bianaParser Object

        default_db_description: default value of the "database-description"
            option.
        default_script_name: stored as self.script_name.
        default_script_description: stored as self.script_description.
        additional_compulsory_arguments: extra (option, default, help)
            tuples appended to the compulsory options; None means no extras.
        additional_optional_arguments: extra (option, default, help) tuples
            appended to the optional options; None means no extras.

        Option names ending in "=" take a value on the command line.
        """

        print("Parser object started")

        self.compulsory_arguments = [
            ("input-identifier=", None, "path or file name of input file(s) containing database data. Path names must end with \"/\"."),
            ("biana-dbname=", None, "name of database biana to be used"),
            ("biana-dbhost=", None, "name of host where database biana to be used is placed"),
            ("database-name=", None, "internal identifier name to this database (it must be unique in the database)"),
            ("database-version=", None, "version of the database to be inserted"),
        ]
        self.compulsory_arguments.extend(additional_compulsory_arguments or [])

        # NOTE(review): the default value of the two credential options was
        # scrubbed in the text this block was recovered from; None matches
        # every other value-taking optional option -- confirm.
        self.optional_arguments = [
            ("biana-dbuser=", None, "username accessing the database (not required in most systems)"),
            ("biana-dbpass=", None, "password of username accessing the database (not required in most systems"),
            ("help", None, "prints this message and exits"),
            ("verbose", 0, "prints process info to stdout"),
            ("log-file=", None, "Prints a log file of the parsing result (number of inserted proteins, references...)"),
            ("time-control", None, "prints to stderr a control of the timing of the parser"),
            ("database-description=", default_db_description, "Description of the database to be inserted."),
            ("optimize-for-parsing", None, "Optimizes database for parsing"),
            ("promiscuous", False, "sets the database to be parsed as promiscuous (whose entities can be included in multi user entities)"),
        ]
        self.optional_arguments.extend(additional_optional_arguments or [])

        self.script_name = default_script_name
        self.script_description = default_script_description

        # Parse the command line and expose every general option as an
        # attribute.
        self.arguments_dic = self.parseArguments()
        self.input_file = self.arguments_dic["input-identifier"]
        self.biana_dbname = self.arguments_dic["biana-dbname"]
        self.biana_dbhost = self.arguments_dic["biana-dbhost"]
        self.sourcedb_name = self.arguments_dic["database-name"]
        self.sourcedb_version = self.arguments_dic["database-version"]
        self.biana_dbuser = self.arguments_dic["biana-dbuser"]
        self.biana_dbpass = self.arguments_dic["biana-dbpass"]
        self.help = self.arguments_dic["help"]
        self.verbose = self.arguments_dic["verbose"]
        self.time_control = self.arguments_dic["time-control"]
        self.log_file = self.arguments_dic["log-file"]
        self.optimize_for_parsing = self.arguments_dic["optimize-for-parsing"]
        # Flag deciding whether the database gives information that is going
        # to be added to more than one user entity.
        self.is_promiscuous = self.arguments_dic["promiscuous"]

        self.database = None
        # "default-attribute" only exists when a concrete parser declared it
        # as an additional argument; otherwise fall back to an empty string.
        self.default_eE_attribute = self.arguments_dic.get("default-attribute", "")

        # Maximum accepted text length per attribute identifier (lower
        # case), derived from the SQL type declared for the attribute.
        # Attributes whose type is not recognised get no entry.
        self.attribute_identifier_to_max_value_length = {}
        attribute_types = (
            EXTERNAL_ENTITY_IDENTIFIER_ATTRIBUTES +
            EXTERNAL_ENTITY_VERSIONABLE_IDENTIFIER_ATTRIBUTE_TYPES +
            EXTERNAL_ENTITY_DESCRIPTIVE_SEARCHABLE_ATTRIBUTE_TYPES +
            EXTERNAL_ENTITY_DESCRIPTIVE_ATTRIBUTE_TYPES +
            EXTERNAL_ENTITY_NUMERIC_ATTRIBUTE_TYPES)
        for key, value in attribute_types:
            max_length = self._max_value_length(value.lower())
            if max_length is not None:
                self.attribute_identifier_to_max_value_length[key.lower()] = max_length

    @staticmethod
    def _max_value_length(sql_type):
        """
        Return the maximum text length for a lower-cased SQL type string,
        or None when the type is not one of the recognised ones.
        """
        # Sized character types: the declared size is the limit.
        for prefix in ("varchar(", "char(", "text("):
            if sql_type.startswith(prefix):
                return int(sql_type[len(prefix):].rstrip(")"))

        if sql_type.startswith("integer("):
            if sql_type.endswith(" unsigned"):
                sql_type = sql_type[:-len(" unsigned")]
            # The declared size is used as a number of bytes; the limit is
            # the number of decimal digits of 256 ** size.
            size = int(sql_type[len("integer("):].rstrip(")"))
            return len(str((2 ** 8) ** size))
        if sql_type.startswith("integer"):
            return len(str((2 ** 8) ** 4))
        if sql_type.startswith("smallint"):
            return len(str((2 ** 8) ** 2))
        if sql_type.startswith("text"):
            # Unsized text: effectively unlimited.
            return 999999
        return None

    def verify_attribute_length(self, attribute_identifier, attribute_value):
        """
        Warn on stderr when an attribute value is empty or too long.

        attribute_identifier: attribute name; looked up case-insensitively
            because the length table is keyed by lower-cased identifiers.
        attribute_value: text value to check (it is lower-cased before
            being measured and reported).

        Raises KeyError when no maximum length is known for the identifier.
        Nothing is returned and the value is never modified or rejected.
        """
        attribute_value = attribute_value.lower()
        length = len(attribute_value)
        max_length = self.attribute_identifier_to_max_value_length[attribute_identifier.lower()]
        if length > max_length:
            sys.stderr.write("\n%s longer than expected: %s\n" % (attribute_identifier, attribute_value))
        elif length == 0:
            sys.stderr.write("\n%s has 0 length: %s\n" % (attribute_identifier, attribute_value))

    def start(self):
        """
        Run the whole parsing process against the BIANA database.

        Opens the database connection, registers the external database
        described by the command-line options, calls parse_database() and
        finally close(). If anything fails once the external database has
        been registered, the traceback is printed, the database changes are
        rolled back and the process exits with status 1.

        The process also exits with status 1 when the database name or
        version is an integer, which is what parseArguments stores for an
        option given without a value.
        """

        print("Parser started")
        if isinstance(self.sourcedb_name, int) or isinstance(self.sourcedb_version, int):
            sys.stderr.write("You must insert correctly the database name and database version\n")
            sys.exit(1)

        self.database_description = self.arguments_dic["database-description"]

        # Log dictionary where all log information will be stored
        self.log = {}
        if self.log_file:
            # open() instead of file(): file() does not exist on Python 3.
            self.log_file_fd = open(self.log_file, 'w')

        self.biana_access = BianaDBaccess(dbname=self.biana_dbname, dbhost=self.biana_dbhost, dbuser=self.biana_dbuser, use_buffer=True, dbpassword=self.biana_dbpass, lock_tables=True, check_integrity=True)

        # Reference point for the parsing time and for the time reports
        # written by close().
        self.initial_time = time.time()

        # Describe the database being parsed and register it in BIANA.
        self.database = ExternalDatabase(databaseName=self.sourcedb_name,
                                         databaseVersion=self.sourcedb_version,
                                         databaseFile=self.input_file.split(os.sep)[-1],
                                         databaseDescription=self.database_description,
                                         defaultExternalEntityAttribute=self.default_eE_attribute,
                                         isPromiscuous=self.is_promiscuous)

        self.biana_access.insert_new_external_database(externalDatabase=self.database)

        # Opening the input file descriptor is a responsibility of the
        # subclasses' parse_database method.
        try:
            if self.optimize_for_parsing:
                self.biana_access.optimize_database_for(mode="parsing")

            self.parse_database()

            # set the parsing time
            self.database.set_parsing_time(int(time.time() - self.initial_time))

            # Updates the information that this external database has inserted
            self.biana_access.update_external_database_external_entity_attributes(self.database)

            self.close()

        except:
            # Deliberately catches everything (including KeyboardInterrupt
            # and SystemExit) so that any interruption rolls back.
            traceback.print_exc()
            sys.stderr.write("ERROR WHILE PARSING. ALL MODIFICATIONS ARE GOING TO BE DELETED\n")
            self.biana_access._rollback()
            sys.exit(1)

    # METHODS

    def close(self):
        """
        Close the database connection and emit the final reports.

        As bianaDBaccess uses an internal buffer, it is necessary to close
        the connection to be sure that all inserts are correctly done, as
        well as to unlock tables. Afterwards the total time is written to
        stderr (time-control / verbose) and the log string is written to the
        log file and/or stderr, depending on the options given.
        """

        self.biana_access.close()

        if self.time_control:
            sys.stderr.write("Total time: %s seconds\n" % (time.time() - self.initial_time))

        if self.log_file:
            self.log_file_fd.write(self.get_log_string())
            self.log_file_fd.close()

        if self.verbose:
            sys.stderr.write("\n Total time: %s \n" % (time.time() - self.initial_time))
            sys.stderr.write(self.get_log_string())

    ## GENERAL PARSER METHODS ##
    def parseArguments(self):
        """
        Method that returns a dictionary with the values of the arguments

        Keys are the option names without the trailing "="; every option
        starts with its declared default. Options given without a value are
        stored as 1. Parsing starts at sys.argv[2], so the first positional
        argument after the script name is skipped (presumably the name of
        the concrete parser -- confirm against the launcher script).

        Raises ValueError when getopt rejects the command line. Exits with
        status 2 after printing the help when "--help" is given or when a
        compulsory argument is missing.
        """

        arguments = self.compulsory_arguments + self.optional_arguments

        # getopt needs the raw specifications (the trailing "=" marks the
        # options that take a value); the result is keyed by the bare name.
        list_arguments = [argument[0] for argument in arguments]
        return_dict = {}
        for argument in arguments:
            return_dict[argument[0].replace("=", "")] = argument[1]

        try:
            opts, _positional = getopt.getopt(sys.argv[2:], "", list_arguments)
        except getopt.GetoptError as bad_opt:
            raise ValueError("%s\n" % (str(bad_opt)))

        for option, value in opts:
            if option == "--help":
                self.print_help()
                sys.exit(2)
            # getopt always reports the full option name, so the name can
            # be looked up directly.
            option_name = option[2:]
            if option_name in return_dict:
                return_dict[option_name] = 1 if value == "" else value

        # Check for all compulsory arguments:
        for comp_arg in self.compulsory_arguments:
            comp_name = comp_arg[0].replace("=", "")
            if return_dict[comp_name] is None:
                sys.stderr.write("%s argument is not defined!\n" % (comp_name))
                self.print_help()
                sys.exit(2)

        return return_dict
Beispiel #3
0
    def start(self):
        """
        Run the whole parsing process against the BIANA database.

        Opens the database connection, registers the external database
        described by the parser's attributes, calls parse_database() and
        finally close(). If anything fails once the external database has
        been registered, the traceback is printed, the database changes are
        rolled back and the process exits with status 1.

        The process also exits with status 1 when the database name or
        version is an integer (presumably the marker left by the argument
        parser for an option given without a value -- confirm).
        """

        print("Parser started")
        if isinstance(self.sourcedb_name, int) or isinstance(
                self.sourcedb_version, int):
            sys.stderr.write(
                "You must insert correctly the database name and database version\n"
            )
            sys.exit(1)

        self.database_description = self.arguments_dic["database-description"]

        # Log dictionary where all log information will be stored
        self.log = {}
        if self.log_file:
            # open() instead of file(): file() does not exist on Python 3.
            self.log_file_fd = open(self.log_file, 'w')

        self.biana_access = BianaDBaccess(dbname=self.biana_dbname,
                                          dbhost=self.biana_dbhost,
                                          dbuser=self.biana_dbuser,
                                          use_buffer=True,
                                          dbpassword=self.biana_dbpass,
                                          lock_tables=True,
                                          check_integrity=True)

        # Reference point for the parsing time computed below.
        self.initial_time = time.time()

        # Describe the database being parsed and register it in BIANA.
        self.database = ExternalDatabase(
            databaseName=self.sourcedb_name,
            databaseVersion=self.sourcedb_version,
            databaseFile=self.input_file.split(os.sep)[-1],
            databaseDescription=self.database_description,
            defaultExternalEntityAttribute=self.default_eE_attribute,
            isPromiscuous=self.is_promiscuous)

        self.biana_access.insert_new_external_database(
            externalDatabase=self.database)

        # Opening the input file descriptor is a responsibility of the
        # subclasses' parse_database method.
        try:
            if self.optimize_for_parsing:
                self.biana_access.optimize_database_for(mode="parsing")

            self.parse_database()

            # set the parsing time
            self.database.set_parsing_time(int(time.time() -
                                               self.initial_time))

            # Updates the information that this external database has inserted
            self.biana_access.update_external_database_external_entity_attributes(
                self.database)

            self.close()

        except:
            # Deliberately catches everything (including KeyboardInterrupt
            # and SystemExit) so that any interruption rolls back.
            traceback.print_exc()
            sys.stderr.write(
                "ERROR WHILE PARSING. ALL MODIFICATIONS ARE GOING TO BE DELETED\n"
            )
            self.biana_access._rollback()
            sys.exit(1)
Beispiel #4
0
    def start(self):
        """
        Run the whole parsing process against the BIANA database.

        Opens the database connection, registers the external database
        described by the parser's attributes, calls parse_database() and
        finally close(). If anything fails once the external database has
        been registered, the traceback is printed, the database changes are
        rolled back and the process exits with status 1.

        The process also exits with status 1 when the database name or
        version is an integer (presumably the marker left by the argument
        parser for an option given without a value -- confirm).
        """

        print("Parser started")
        if isinstance(self.sourcedb_name, int) or isinstance(self.sourcedb_version, int):
            sys.stderr.write("You must insert correctly the database name and database version\n")
            sys.exit(1)

        self.database_description = self.arguments_dic["database-description"]

        # Log dictionary where all log information will be stored
        self.log = {}
        if self.log_file:
            # open() instead of file(): file() does not exist on Python 3.
            self.log_file_fd = open(self.log_file, 'w')

        self.biana_access = BianaDBaccess(dbname=self.biana_dbname, dbhost=self.biana_dbhost, dbuser=self.biana_dbuser, use_buffer=True, dbpassword=self.biana_dbpass, lock_tables=True, check_integrity=True)

        # Reference point for the parsing time computed below.
        self.initial_time = time.time()

        # Describe the database being parsed and register it in BIANA.
        self.database = ExternalDatabase(databaseName=self.sourcedb_name,
                                         databaseVersion=self.sourcedb_version,
                                         databaseFile=self.input_file.split(os.sep)[-1],
                                         databaseDescription=self.database_description,
                                         defaultExternalEntityAttribute=self.default_eE_attribute,
                                         isPromiscuous=self.is_promiscuous)

        self.biana_access.insert_new_external_database(externalDatabase=self.database)

        # Opening the input file descriptor is a responsibility of the
        # subclasses' parse_database method.
        try:
            if self.optimize_for_parsing:
                self.biana_access.optimize_database_for(mode="parsing")

            self.parse_database()

            # set the parsing time
            self.database.set_parsing_time(int(time.time() - self.initial_time))

            # Updates the information that this external database has inserted
            self.biana_access.update_external_database_external_entity_attributes(self.database)

            self.close()

        except:
            # Deliberately catches everything (including KeyboardInterrupt
            # and SystemExit) so that any interruption rolls back.
            traceback.print_exc()
            sys.stderr.write("ERROR WHILE PARSING. ALL MODIFICATIONS ARE GOING TO BE DELETED\n")
            self.biana_access._rollback()
            sys.exit(1)