Example #1
0
 def make_sort_trinomial(self, trinomial):
     """ make a sortable trinomial

         The trinomial is parsed with TrinomialManage.parse_trinomial
         and rebuilt as state + county + site, where the site part is
         first passed through self.prepend_site_zeros(site, 5).
     """
     parts = TrinomialManage().parse_trinomial(trinomial)
     # site part as returned by prepend_site_zeros with a width of 5
     site_for_sort = self.prepend_site_zeros(parts['site'], 5)
     state_and_county = str(parts['state']) + str(parts['county'])
     return state_and_county + site_for_sort
Example #2
0
 def find_trinomials_in_metadata(self, xml):
     """ finds trinomials in metadata

         For every oai:record found in 'xml' this collects the record
         URI (from dc:identifier), the title, a citation (from
         self.make_citation_html) and the trinomials that
         self.extract_texas_trinomials_from_text finds in the
         dc:title, dc:description and dc:subject elements.

         Records with at least one trinomial are appended to
         self.trinomial_refs. Trinomials not seen before are appended
         to self.unique_trinomials, and their state + county codes are
         appended to self.unique_state_counties when new.

         Returns None; results are stored on the instance and progress
         is printed.
     """
     dc_path = 'oai:metadata/oai_dc:dc/dc:'
     recs = xml.xpath('//oai:record', namespaces=self.namespaces)
     print('Number of records in XML: ' + str(len(recs)))
     for rec in recs:
         tri_dict = {
             'rec_uri': None,
             'title': None,
             'citation_html': None,
             'source_label': 'Index of Texas Archaeology: Open Access Gray Literature from the Lone Star State',
             'source_uri': 'http://scholarworks.sfasu.edu/ita',
             'request_url': self.request_url,
             'trinomials': []
         }
         idents = rec.xpath(dc_path + 'identifier',
                            namespaces=self.namespaces)
         rec_uri = None
         for ident_dom in idents:
             act_uri = ident_dom.text
             if act_uri is None:
                 # an identifier element without text cannot be tested
                 # for 'viewcontent.cgi' (TypeError), so skip it
                 continue
             # take the first identifier, but let any later identifier
             # that is not a 'viewcontent.cgi' link replace it
             if rec_uri is None or 'viewcontent.cgi' not in act_uri:
                 rec_uri = act_uri
         tri_dict['rec_uri'] = rec_uri
         # titles, descriptions and subjects are all searched the same
         # way; only the title text is also kept in the record dict
         for dc_field in ('title', 'description', 'subject'):
             field_doms = rec.xpath(dc_path + dc_field,
                                    namespaces=self.namespaces)
             for field_dom in field_doms:
                 if dc_field == 'title':
                     tri_dict['title'] = field_dom.text
                 tri_dict['trinomials'] = self.extract_texas_trinomials_from_text(
                     field_dom.text,
                     tri_dict['trinomials'])
         tri_dict['citation_html'] = self.make_citation_html(rec)
         print('-----------------------------------')
         print(tri_dict['citation_html'])
         print('Trinomials: ' + str(tri_dict['trinomials']))
         print('-----------------------------------')
         if not tri_dict['trinomials']:
             # nothing found in this record
             continue
         # we found trinomials! So add to our list of publications with trinomials
         self.trinomial_refs.append(tri_dict)
         for trinomial in tri_dict['trinomials']:
             if trinomial in self.unique_trinomials:
                 continue
             self.unique_trinomials.append(trinomial)
             # now add the trinomial state and county code to the list
             # of unique state and county codes if it is new
             # first, we parse the trinomial to find the state and county parts
             tri_m = TrinomialManage()
             tri_p = tri_m.parse_trinomial(trinomial)
             state_county = tri_p['state'] + tri_p['county']
             if state_county not in self.unique_state_counties:
                 # we have a new state and county code, so add it to the list
                 self.unique_state_counties.append(state_county)
Example #3
0
 def validate_token_trinomial(self, token):
     """ checks to see if a token is a trinomial

         The token is upper-cased and parsed with
         TrinomialManage.parse_trinomial. It is accepted only when:
         - the state part is 1 or 2 characters long and numeric,
         - the state part equals self.only_valid_state_id, if that
           attribute is a string,
         - the county part, stripped of non-letters, has exactly
           2 characters,
         - the site part is not blank and is numeric.

         Returns the state + county + site string for an accepted
         token, otherwise None.
     """
     def is_number(part):
         # same numeric test for the state and the site parts
         try:
             int(float(part))
         except (TypeError, ValueError, OverflowError):
             return False
         return True

     # make the token upper case, so all county codes are upper case, if this
     # token happens to be a trinomial
     token = token.upper()
     tri_m = TrinomialManage()
     try:
         tri_p = tri_m.parse_trinomial(token)
     except Exception:
         # could not parse as a trinomial
         # so it's not a trinomial
         return None
     if not isinstance(tri_p, dict):
         return None
     # make sure the county part is only letters
     tri_p['county'] = re.sub('[^a-zA-Z]+', '', tri_p['county'])
     # now validate different parts of the trinomial
     state = tri_p['state']
     if len(state) < 1 or len(state) > 2:
         # the state code has the wrong length not 1 or 2 characters
         return None
     if not is_number(state):
         return None
     if isinstance(self.only_valid_state_id, str):
         # we need to validate by an allowed state code
         if state != self.only_valid_state_id:
             # the state part of the trinomial is not
             # the allowed state id
             return None
     if len(tri_p['county']) != 2:
         # county part of trinomial is the wrong length
         return None
     site = tri_p['site']
     if len(site) < 1:
         # can't have a blank site number
         return None
     if not is_number(site):
         # the site part (not the state part) has to be numeric
         return None
     # now put together the trinomial parts into a well formatted full trinomial
     return state + tri_p['county'] + site
Example #4
0
 def validate_token_trinomial(self, token):
     """ checks to see if a token is a trinomial

         The token is upper-cased and parsed with
         TrinomialManage.parse_trinomial. It is accepted only when:
         - the state part is 1 or 2 characters long and numeric,
         - the state part equals self.only_valid_state_id, if that
           attribute is a string,
         - the county part, stripped of non-letters, has exactly
           2 characters,
         - the site part is not blank and is numeric.

         Returns the state + county + site string for an accepted
         token, otherwise None.
     """
     def is_number(part):
         # same numeric test for the state and the site parts
         try:
             int(float(part))
         except (TypeError, ValueError, OverflowError):
             return False
         return True

     # make the token upper case, so all county codes are upper case, if this
     # token happens to be a trinomial
     token = token.upper()
     tri_m = TrinomialManage()
     try:
         tri_p = tri_m.parse_trinomial(token)
     except Exception:
         # could not parse as a trinomial
         # so it's not a trinomial
         return None
     if not isinstance(tri_p, dict):
         return None
     # make sure the county part is only letters
     tri_p['county'] = re.sub('[^a-zA-Z]+', '', tri_p['county'])
     # now validate different parts of the trinomial
     state = tri_p['state']
     if len(state) < 1 or len(state) > 2:
         # the state code has the wrong length not 1 or 2 characters
         return None
     if not is_number(state):
         return None
     if isinstance(self.only_valid_state_id, str):
         # we need to validate by an allowed state code
         if state != self.only_valid_state_id:
             # the state part of the trinomial is not
             # the allowed state id
             return None
     if len(tri_p['county']) != 2:
         # county part of trinomial is the wrong length
         return None
     site = tri_p['site']
     if len(site) < 1:
         # can't have a blank site number
         return None
     if not is_number(site):
         # the site part (not the state part) has to be numeric
         return None
     # now put together the trinomial parts into a well formatted full trinomial
     return state + tri_p['county'] + site
Example #5
0
def make_trinomial_instances_df(doc_dir):
    """Collect trinomial-like tokens from the .txt files under doc_dir.

    Every file ending in '.txt' found by os.walk(doc_dir) is read and
    searched for tokens made of 1-2 digits, 2 or more capital letters
    and 1 or more digits. Tokens starting with '0', and tokens whose
    state number is not between 1 and 50, are skipped. Each unique
    token is reported once per file, in sorted order within that file.

    Returns a pandas DataFrame with the columns 'filename',
    'pos_trinomial', 'state_num', 'region_abbr' and 'site_number',
    one row per reported token. Each row is also printed.
    """
    tri_man = TrinomialManage()
    tri_man.remove_prepended_zeros = True
    columns = [
        'filename',
        'pos_trinomial',
        'state_num',
        'region_abbr',
        'site_number'
    ]
    # no capture groups, so findall returns the matched strings directly
    token_pattern = re.compile(r'\b[0-9]{1,2}[A-Z]{2,}[0-9]{1,}\b')
    rows = []
    for subdir, _dirs, files in os.walk(doc_dir):
        for file in files:
            if not file.endswith('.txt'):
                continue
            filepath = os.path.join(subdir, file)
            with open(filepath, 'r') as file_obj:
                content = file_obj.read()
            # sorted so the rows of one file come out in a stable order
            for trinomial in sorted(set(token_pattern.findall(content))):
                if trinomial.startswith('0'):
                    # not a trinomial
                    continue
                tri_parts = tri_man.parse_trinomial(trinomial)
                state = int(tri_parts['state'])
                if state < 1 or state > 50:
                    # not a state, skip
                    continue
                rows.append([
                    file,
                    trinomial,
                    state,
                    tri_parts['county'],
                    tri_parts['site']
                ])
                print('[{}] Found {} in {} ({}, {}, {})'.format(
                        len(rows),
                        trinomial,
                        file,
                        state,
                        tri_parts['county'],
                        tri_parts['site'],
                    )
                )
    # build the frame once instead of enlarging it row by row
    return pd.DataFrame(rows, columns=columns)
Example #6
0
 def make_aux_trinomial_list(self, trinomial):
     """ makes a list of auxiliary, non standard
         trinomials

         Returns a list with two items: the parsed trinomial parts
         joined by self.join_parts with a '-' separator, and the same
         parts joined again after the site part has been passed
         through self.prepend_site_zeros(site, 5).
     """
     aux_tris = []
     # get a dictionary for the different parts of the trinomial
     tri_man = TrinomialManage()
     tri_parts = tri_man.parse_trinomial(trinomial)
     # add a - separator between parts
     aux_tris.append(self.join_parts('-', tri_parts))
     # work on a copy, so the dict returned by parse_trinomial is not
     # changed in place when the site part is replaced
     p_tri_parts = dict(tri_parts)
     p_tri_parts['site'] = self.prepend_site_zeros(tri_parts['site'], 5)
     aux_tris.append(self.join_parts('-', p_tri_parts))
     return aux_tris
Example #7
0
 def make_aux_trinomial_list(self, trinomial):
     """ makes a list of auxiliary, non standard
         trinomials

         Returns a list with two items: the parsed trinomial parts
         joined by self.join_parts with a '-' separator, and the same
         parts joined again after the site part has been passed
         through self.prepend_site_zeros(site, 5).
     """
     aux_tris = []
     # get a dictionary for the different parts of the trinomial
     tri_man = TrinomialManage()
     tri_parts = tri_man.parse_trinomial(trinomial)
     # add a - separator between parts
     aux_tris.append(self.join_parts('-', tri_parts))
     # work on a copy, so the dict returned by parse_trinomial is not
     # changed in place when the site part is replaced
     p_tri_parts = dict(tri_parts)
     p_tri_parts['site'] = self.prepend_site_zeros(tri_parts['site'], 5)
     aux_tris.append(self.join_parts('-', p_tri_parts))
     return aux_tris
Example #8
0
 def make_sort_trinomial(self, trinomial):
     """ make a sortable trinomial

         The trinomial is parsed with TrinomialManage.parse_trinomial
         and rebuilt as state + county + site, where the site part is
         first passed through self.prepend_site_zeros(site, 5).
     """
     parts = TrinomialManage().parse_trinomial(trinomial)
     # site part as returned by prepend_site_zeros with a width of 5
     site_for_sort = self.prepend_site_zeros(parts['site'], 5)
     state_and_county = str(parts['state']) + str(parts['county'])
     return state_and_county + site_for_sort
Example #9
0
 def match_trinomial_obj(self, tri):
     """ Attempts to match a trinomial object 'tri'
         against tDAR, if it hasn't yet been matched

         Nothing is requested unless a Manifest record exists for
         tri.uuid and no LinkAnnotation with the predicate
         'dc-terms:subject' and an object_uri containing
         self.TDAR_VOCAB exists for it yet.

         For every accepted tDAR result a LinkEntity is created if
         none exists for the result id, and a LinkAnnotation is saved.

         Returns the number of accepted matches (0 if skipped).
         Calls sys.exit() when self.error_wait grows beyond
         self.max_wait after a failed request.
     """
     found_matches = 0
     manifest = False
     try:
         manifest = Manifest.objects.get(uuid=tri.uuid)
     except Manifest.DoesNotExist:
         manifest = False
     # NOTE(review): the filter uses the literal 'dc-terms:subject' while
     # the annotation saved below uses self.DC_TERMS_SUBJECT -- confirm
     # that both hold the same value
     la_check = LinkAnnotation.objects\
                              .filter(subject=tri.uuid,
                                      predicate_uri='dc-terms:subject',
                                      object_uri__contains=self.TDAR_VOCAB)[:1]
     if len(la_check) < 1 and manifest is not False:
         # we don't already have a tDAR id for this item, continue with matches
         tri_man = TrinomialManage()
         # the trinomial as stored is always the first keyword requested
         request_keywords = [tri.trinomial]
         if self.lead_zero_check:
             # check multiple leading zeros
             tri_parts = tri_man.parse_trinomial(tri.trinomial)
             site = tri_parts['site']
             # NOTE(review): site_part_len is assigned but never read
             site_part_len = len(site)
             # add one keyword for each zero prepended, until the site
             # part is 4 characters long
             while len(site) < 4:
                 site = '0' + site
                 new_trinomial = tri_parts['state'] + tri_parts[
                     'county'] + site
                 request_keywords.append(new_trinomial)
         for keyword in request_keywords:
             # a new tdarAPI object is made for every keyword
             tdar_api = tdarAPI()
             results = tdar_api.get_site_keyword(keyword)
             if isinstance(results, list):
                 # only the first self.max_results results are checked
                 for result in results[:self.max_results]:
                     # assume it is a spurious match
                     match_real = False
                     if result['label'] == tri.trinomial:
                         # the trinomial and the tDAR result exactly match
                         match_real = True
                     else:
                         # check if the only difference is in leading zeros
                         tri_parts = tri_man.parse_trinomial(tri.trinomial)
                         site = tri_parts['site']
                         # NOTE(review): site_part_len is assigned but never read
                         site_part_len = len(site)
                         # compare every zero-padded form, up to a site
                         # part of 5 characters, with the result label
                         while len(site) < 5:
                             site = '0' + site
                             new_trinomial = tri_parts['state'] + tri_parts[
                                 'county'] + site
                             if new_trinomial == result['label']:
                                 # A good match, the tDAR result and the trinomial
                                 # match (but with different leading zeros)
                                 match_real = True
                     if match_real:
                         found_matches += 1
                         # OK! Found a match, first save the linked entity in the link entity table
                         le_check = False
                         try:
                             le_check = LinkEntity.objects.get(
                                 uri=result['id'])
                         except LinkEntity.DoesNotExist:
                             le_check = False
                         if le_check is False:
                             # no entity for this tDAR id yet, so create it
                             le = LinkEntity()
                             le.uri = result['id']
                             le.label = result['label']
                             le.alt_label = result['label']
                             le.vocab_uri = self.TDAR_VOCAB
                             le.ent_type = 'type'
                             le.save()
                         # Now save the link annotation
                         la = LinkAnnotation()
                         la.subject = tri.uuid
                         la.subject_type = manifest.item_type
                         la.project_uuid = manifest.project_uuid
                         la.source_id = 'tdar-api-lookup'
                         la.predicate_uri = self.DC_TERMS_SUBJECT
                         la.object_uri = result['id']
                         la.save()
                     else:
                         print('Almost! ' + result['label'] +
                               ' is not exactly: ' + tri.trinomial)
             if tdar_api.request_error:
                 self.request_error = True
                 print('HTTP request to tDAR failed!')
                 # the wait grows by self.base_wait with every failed request
                 self.error_wait += self.base_wait
                 if self.error_wait > self.max_wait:
                     print('Too many failures, quiting...')
                     sys.exit('Quitting process')
                 else:
                     # sleep some minutes before trying again
                     # NOTE(review): after the sleep the loop moves on to the
                     # next keyword; the failed keyword is not requested again
                     print('Will try again in ' + str(self.error_wait) +
                           ' seconds...')
                     sleep(self.error_wait)
             else:
                 self.request_error = False
                 if self.error_wait >= self.base_wait:
                     # a request worked again, so reset the wait
                     print('HTTP requests resumed OK, will continue.')
                     self.error_wait = 0
     return found_matches
Example #10
0
 def match_trinomial_obj(self, tri):
     """ Attempts to match a trinomial object 'tri'
         against tDAR, if it hasn't yet been matched

         Nothing is requested unless a Manifest record exists for
         tri.uuid and no LinkAnnotation with the predicate
         'dc-terms:subject' and an object_uri containing
         self.TDAR_VOCAB exists for it yet.

         For every accepted tDAR result a LinkEntity is created if
         none exists for the result id, and a LinkAnnotation is saved.

         Returns the number of accepted matches (0 if skipped).
         Calls sys.exit() when self.error_wait grows beyond
         self.max_wait after a failed request.
     """
     found_matches = 0
     manifest = False
     try:
         manifest = Manifest.objects.get(uuid=tri.uuid)
     except Manifest.DoesNotExist:
         manifest = False
     # NOTE(review): the filter uses the literal 'dc-terms:subject' while
     # the annotation saved below uses self.DC_TERMS_SUBJECT -- confirm
     # that both hold the same value
     la_check = LinkAnnotation.objects\
                              .filter(subject=tri.uuid,
                                      predicate_uri='dc-terms:subject',
                                      object_uri__contains=self.TDAR_VOCAB)[:1]
     if len(la_check) < 1 and manifest is not False:
         # we don't already have a tDAR id for this item, continue with matches
         tri_man = TrinomialManage()
         # the trinomial as stored is always the first keyword requested
         request_keywords = [tri.trinomial]
         if self.lead_zero_check:
             # check multiple leading zeros
             tri_parts = tri_man.parse_trinomial(tri.trinomial)
             site = tri_parts['site']
             # NOTE(review): site_part_len is assigned but never read
             site_part_len = len(site)
             # add one keyword for each zero prepended, until the site
             # part is 4 characters long
             while len(site) < 4:
                 site = '0' + site
                 new_trinomial = tri_parts['state'] + tri_parts['county'] + site
                 request_keywords.append(new_trinomial)
         for keyword in request_keywords:
             # a new tdarAPI object is made for every keyword
             tdar_api = tdarAPI()
             results = tdar_api.get_site_keyword(keyword)
             if isinstance(results, list):
                 # only the first self.max_results results are checked
                 for result in results[:self.max_results]:
                     # assume it is a spurious match
                     match_real = False
                     if result['label'] == tri.trinomial:
                         # the trinomial and the tDAR result exactly match
                         match_real = True
                     else:
                         # check if the only difference is in leading zeros
                         tri_parts = tri_man.parse_trinomial(tri.trinomial)
                         site = tri_parts['site']
                         # NOTE(review): site_part_len is assigned but never read
                         site_part_len = len(site)
                         # compare every zero-padded form, up to a site
                         # part of 5 characters, with the result label
                         while len(site) < 5:
                             site = '0' + site
                             new_trinomial = tri_parts['state'] + tri_parts['county'] + site
                             if new_trinomial == result['label']:
                                 # A good match, the tDAR result and the trinomial
                                 # match (but with different leading zeros)
                                 match_real = True
                     if match_real:
                         found_matches += 1
                         # OK! Found a match, first save the linked entity in the link entity table
                         le_check = False
                         try:
                             le_check = LinkEntity.objects.get(uri=result['id'])
                         except LinkEntity.DoesNotExist:
                             le_check = False
                         if le_check is False:
                             # no entity for this tDAR id yet, so create it
                             le = LinkEntity()
                             le.uri = result['id']
                             le.label = result['label']
                             le.alt_label = result['label']
                             le.vocab_uri = self.TDAR_VOCAB
                             le.ent_type = 'type'
                             le.save()
                         # Now save the link annotation
                         la = LinkAnnotation()
                         la.subject = tri.uuid
                         la.subject_type = manifest.item_type
                         la.project_uuid = manifest.project_uuid
                         la.source_id = 'tdar-api-lookup'
                         la.predicate_uri = self.DC_TERMS_SUBJECT
                         la.object_uri = result['id']
                         la.save()
                     else:
                         print('Almost! ' + result['label'] + ' is not exactly: ' + tri.trinomial)
             if tdar_api.request_error:
                 self.request_error = True
                 print('HTTP request to tDAR failed!')
                 # the wait grows by self.base_wait with every failed request
                 self.error_wait += self.base_wait
                 if self.error_wait > self.max_wait:
                     print('Too many failures, quiting...')
                     sys.exit('Quitting process')
                 else:
                     # sleep some minutes before trying again
                     # NOTE(review): after the sleep the loop moves on to the
                     # next keyword; the failed keyword is not requested again
                     print('Will try again in ' + str(self.error_wait) + ' seconds...')
                     sleep(self.error_wait)
             else:
                 self.request_error = False
                 if self.error_wait >= self.base_wait:
                     # a request worked again, so reset the wait
                     print('HTTP requests resumed OK, will continue.')
                     self.error_wait = 0
     return found_matches
Example #11
0
 def find_trinomials_in_metadata(self, xml):
     """ finds trinomials in metadata

         For every oai:record found in 'xml' this collects the record
         URI (from dc:identifier), the title, a citation (from
         self.make_citation_html) and the trinomials that
         self.extract_texas_trinomials_from_text finds in the
         dc:title, dc:description and dc:subject elements.

         Records with at least one trinomial are appended to
         self.trinomial_refs. Trinomials not seen before are appended
         to self.unique_trinomials, and their state + county codes are
         appended to self.unique_state_counties when new.

         Returns None; results are stored on the instance and progress
         is printed.
     """
     dc_path = 'oai:metadata/oai_dc:dc/dc:'
     recs = xml.xpath('//oai:record', namespaces=self.namespaces)
     print('Number of records in XML: ' + str(len(recs)))
     for rec in recs:
         tri_dict = {
             'rec_uri': None,
             'title': None,
             'citation_html': None,
             'source_label':
             'Index of Texas Archaeology: Open Access Gray Literature from the Lone Star State',
             'source_uri': 'http://scholarworks.sfasu.edu/ita',
             'request_url': self.request_url,
             'trinomials': []
         }
         idents = rec.xpath(dc_path + 'identifier',
                            namespaces=self.namespaces)
         rec_uri = None
         for ident_dom in idents:
             act_uri = ident_dom.text
             if act_uri is None:
                 # an identifier element without text cannot be tested
                 # for 'viewcontent.cgi' (TypeError), so skip it
                 continue
             # take the first identifier, but let any later identifier
             # that is not a 'viewcontent.cgi' link replace it
             if rec_uri is None or 'viewcontent.cgi' not in act_uri:
                 rec_uri = act_uri
         tri_dict['rec_uri'] = rec_uri
         # titles, descriptions and subjects are all searched the same
         # way; only the title text is also kept in the record dict
         for dc_field in ('title', 'description', 'subject'):
             field_doms = rec.xpath(dc_path + dc_field,
                                    namespaces=self.namespaces)
             for field_dom in field_doms:
                 if dc_field == 'title':
                     tri_dict['title'] = field_dom.text
                 tri_dict['trinomials'] = self.extract_texas_trinomials_from_text(
                     field_dom.text,
                     tri_dict['trinomials'])
         tri_dict['citation_html'] = self.make_citation_html(rec)
         print('-----------------------------------')
         print(tri_dict['citation_html'])
         print('Trinomials: ' + str(tri_dict['trinomials']))
         print('-----------------------------------')
         if not tri_dict['trinomials']:
             # nothing found in this record
             continue
         # we found trinomials! So add to our list of publications with trinomials
         self.trinomial_refs.append(tri_dict)
         for trinomial in tri_dict['trinomials']:
             if trinomial in self.unique_trinomials:
                 continue
             self.unique_trinomials.append(trinomial)
             # now add the trinomial state and county code to the list
             # of unique state and county codes if it is new
             # first, we parse the trinomial to find the state and county parts
             tri_m = TrinomialManage()
             tri_p = tri_m.parse_trinomial(trinomial)
             state_county = tri_p['state'] + tri_p['county']
             if state_county not in self.unique_state_counties:
                 # we have a new state and county code, so add it to the list
                 self.unique_state_counties.append(state_county)