Example #1
from dpla.api import DPLA  # import needed by this snippet


def search(query, api_key, limit=10):
    """Yield search results item by item."""
    page_size = _page_size(limit)  # module-level helper (defined elsewhere) that picks a page size
    dpla = DPLA(api_key)
    page = 0
    yielded = 0
    while yielded < limit:
        page += 1
        result = dpla.search(query, page_size=page_size, page=page,
                             fields=['sourceResource'])
        if not result.items:
            # Out of results: end the generator with a bare return.
            # Raising StopIteration inside a generator is a RuntimeError
            # as of Python 3.7 (PEP 479).
            return
        for item in result.items:
            yield item
            yielded += 1
            if yielded >= limit:  # don't over-yield when a page overshoots the limit
                return
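A minimal usage sketch for the generator above, assuming a valid API key and that _page_size is the module helper referenced in the snippet; the query string is just an illustration:

for record in search('kittens', 'your-api-key', limit=25):
    print(record.get('sourceResource'))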
Example #2
    def on_status(self, status):
        # Handler on a tweepy stream listener; `api` (a tweepy client),
        # `random`, and `DPLA` are assumed at module scope.

        # Get the query term from the tweet.
        querier = status.author.screen_name
        twt = status.text
        twt_term = twt.replace("@QueryDPLA ", "")

        # Search the DPLA.
        dpla = DPLA('xxxxx')  # your DPLA API key
        result = dpla.search(q=twt_term,
                             fields=["sourceResource.title", "id"],
                             page_size=50)

        # Pick a random result (at most 50 items come back per page).
        total = result.count
        if total > 49:
            json_data = result.items[random.randint(0, 49)]
        elif total >= 1:
            json_data = result.items[random.randint(0, total - 1)]

        # Tweet the DPLA metadata at the querier.
        if total >= 1:
            title = json_data['sourceResource.title']
            json_id = json_data['id']
            item_url = 'http://dp.la/item/%s' % json_id

            api.update_status(".@%s %s %s" % (querier, title[:80], item_url),
                              in_reply_to_status_id=status.id)
        else:
            api.update_status(".@%s No items found. Try another term!" %
                              querier,
                              in_reply_to_status_id=status.id)
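A hedged sketch of wiring this handler into a live stream, assuming it sits on a tweepy StreamListener subclass (pre-4.0 tweepy API) with auth configured as in Example #6 below; QueryListener is a hypothetical name:

listener = QueryListener()  # hypothetical subclass defining on_status above
stream = tweepy.Stream(auth=api.auth, listener=listener)
stream.filter(track=['@QueryDPLA'])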
Example #3
from dpla.api import DPLA
import os
import requests

# Read your API key from an environment variable so you don't have to
# enter it every time; os gives access to the environment. Fall back to
# the hard-coded key.
my_api_key = os.getenv('API_KEY', '6fe82fec460e727f153f50b9e6b28e07')

# Open a connection with the DPLA API.
dpla_connection = DPLA(my_api_key)

# Query the items endpoint directly with the requests library.
endpoint = 'https://api.dp.la/v2/items'

# Set parameters to get items about Austin, Texas.
params = {
    'api_key': my_api_key,
    'q': 'Austin, Texas',
}

# Let's get the response back!
requested_the_hard_way = requests.get(endpoint, params=params)
print(requested_the_hard_way.status_code)

# Print the URL.
print("****")
print(requested_the_hard_way.url)

# Print the number of results.
print("****")
print(requested_the_hard_way.json()['count'])

# Print the first record ('doc') from the first page of results.
print("****")
print(requested_the_hard_way.json()['docs'][0])
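For comparison, the same query through the dpla wrapper opened above; the result object's count and items mirror the raw JSON's 'count' and 'docs' fields:

result_the_easy_way = dpla_connection.search('Austin, Texas')
print(result_the_easy_way.count)
print(result_the_easy_way.items[0])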
Example #4
from dpla.api import DPLA

dpla = DPLA('your_key_here')

# Restrict results to image-type records.
fields = {"sourceResource.type": "image"}

search_query = '"fourth of july" OR "independence day" OR "July 4th" OR "July Fourth"'

# The DPLA API caps page_size at 500; all_records() below walks the full
# result set page by page, so the oversized value is effectively
# "give me everything".
result = dpla.search(search_query, searchFields=fields, page_size=10000)

print(result.count)

# Write one matching item id per line.
with open('ids.csv', "w") as f:
    for x in result.all_records():
        f.write(x['id'] + "\n")
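A follow-up sketch: read the saved ids back and fetch one item's full record, assuming single-item lookup follows the items-endpoint URL pattern shown in Example #3:

import requests

with open('ids.csv') as f:
    first_id = f.readline().strip()

resp = requests.get('https://api.dp.la/v2/items/' + first_id,
                    params={'api_key': 'your_key_here'})
print(resp.json()['docs'][0].get('sourceResource', {}).get('title'))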
Example #5
from dpla.api import DPLA
import os

# os.getenv() takes the *name* of an environment variable, not the key
# itself; passing the key as the name returns None.
my_api_key = os.getenv('API_KEY')
dpla_connection = DPLA(my_api_key)
result = dpla_connection.search('cats')
item = result.items[2]
print(item)
Example #6
import random

import tweepy
from dpla.api import DPLA
from credentials import *

# Authenticate with the Twitter API using credentials.py.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
api = tweepy.API(auth)

# Create a DPLA object using the dpla module and your API key.
dpla = DPLA(DPLA_KEY)

def send_tweet():
    # Generate a random number to use as the page value (DPLA pages are
    # 1-based, so don't start at 0).
    random_page = random.randint(1, 100)

    # Create a result set of DPLA items whose provider is SSDN.
    fields = {"provider": "Sunshine State Digital Network"}
    result = dpla.search(searchFields=fields, page_size=100, page=random_page)

    # Get a random item from the results.
    items = random.sample(result.items, 1)

    # Print the id of the random item to the console - used for testing.
    # print(items[0]["id"])

    # Extract elements from the record to use in the tweet.
    for item in items:
        # Determine if a description field is present, and compose the
        # tweet accordingly.
        if "description" in item["sourceResource"]:
            url = "https://dp.la/item/" + item["id"]
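            # The original example is cut off here; a hedged completion
            # sketch (DPLA titles are sometimes lists, as Example #9 shows):
            title = item["sourceResource"].get("title", "")
            if isinstance(title, list):
                title = title[0]
            api.update_status("%s %s" % (title[:100], url))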
Example #7
from dpla.api import DPLA
from unscroll import UnscrollClient
import pprint
import datefinder

dpla = DPLA('d858a6fc387cfe9eebf702fca43c9798')
result = dpla.search(q="raccoon", page_size=10)

# Walk at most 20 records, trying to parse a datetime out of each one's
# display date.
for i, r in enumerate(result.all_records(), start=1):
    if i > 20:
        break
    pprint.pprint(r)

    dt = None
    date = r.get('date')
    if isinstance(date, list):
        # 'date' sometimes comes back as a list; use its first entry.
        date = date[0]
        print('XXXXX', date)
    if date is not None:
        disp_date = date.get('displayDate')
        if disp_date is not None:
            print('DISP_DATE', disp_date)
            dates = list(datefinder.find_dates(disp_date))
            if dates:
                dt = dates[0]
    print('DTDT', dt)
    if dt is not None:
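        # The original example is truncated here; presumably the parsed
        # datetime feeds the UnscrollClient imported at the top. A minimal
        # placeholder so the block is runnable:
        print('PARSED', dt.isoformat())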
Example #8
import os
from dpla.api import DPLA

my_api_key = os.getenv('API_KEY')
dpla_connection = DPLA(my_api_key)

result = dpla_connection.search('austen')
print(type(result))

# Peek at the result object's attributes.
print(str(result.__dict__)[:1000])

item = result.items[0]
print(item)

# Not every record carries this field; .get() avoids a KeyError.
print(item['sourceResource'].get('stateLocatedIn'))

print(result.items[0]['sourceResource'])
Example #9
# This example is Python 2 code (print statements, unicode); the imports
# it relies on are gathered here.
import codecs
import ConfigParser
import json
import math
import os
import time

from lxml import etree

from dpla.api import DPLA
# DplaMetadata and HathiBibApi are helpers from the example's own project,
# not part of the dpla package.


class DplaApi():
    """Interact with DPLA API using pre-acquired key."""

    def __init__(self):
        """Load DPLA key and establish connection."""
        self.dpla_key = self.__load_dpla_key()
        self.dpla = DPLA(self.dpla_key)
        self.result = None
        self.metadata_records = []

    def search(self, q_value, page_size=100):
        """Run basic search query across DPLA.

        args:
            q_value (str) -- value to search
        kwargs:
            page_size (int) -- max number of results to request from DPLA
                (DPLA-imposed limit is 500).
        """
        self.query = q_value.strip().replace(",", "").replace("(", "").replace(")", "")
        self.result = self.dpla.search(q=q_value, page_size=page_size)
        print "Query: '{0}' returned {1} results".format(self.query, self.result.count)
        time.sleep(.5)
        self.all_returned_items = self.result.items
        if self.result.count > self.result.limit:
            # Force float division: in Python 2, count / limit truncates
            # before the ceil, silently dropping the final page.
            pages = int(math.ceil(float(self.result.count) / self.result.limit))
            for page in range(2, pages + 1):
                print "----Accessing results page {0}".format(page)
                self.result = self.dpla.search(q=q_value, page_size=page_size, page=page)
                self.all_returned_items += self.result.items
                time.sleep(.5)

    def build_arc_rdf_dataset(self, check_match=True, disciplines=""):
        """Iterate over search results and pull necessary elements to create ARC RDF.

        Store results in a list of python dictionaries.
        kwargs:
            check_match (bool): check if RDF for item has already been created.
            disciplines (str): string of "|"-separated values.
        """
        self.disciplines = disciplines
        if check_match:
            self.__load_match_data()

        print "----Check: {0} records transferred"\
              .format(len(self.all_returned_items))
        rdf_matches = 0
        new_records = 0
        for item in self.all_returned_items:
            if check_match:
                if os.path.basename(item["@id"]) in self.existing_records:
                    rdf_matches += 1
                else:
                    self.__process_metadata(item)
                    # Store the basename so it matches the membership test above.
                    self.existing_records.append(os.path.basename(item["@id"]))
                    new_records += 1
            else:
                self.__process_metadata(item)

        print "----Found: {0} existing RDF records".format(rdf_matches)
        print "----Saved: {0} new metadata records".format(new_records)
        if check_match:
            # The match file is only loaded (and needed) when checking matches.
            self._store_match_data()

    def create_tsv(self, records=None,
                   output_path="data/radicalism-dpla-201603.tsv"):
        """Build TSV file for ARC pre-RDF dataset.

        kwargs:
            records(list): by default, the results created from the current search.
        """
        if not records:
            records = self.metadata_records
        if not records:
            print "No records to write."
            return
        tsv_lines = []
        headings = "\t".join(records[0].keys())
        headings += "\n"
        tsv_lines.append(headings)
        for record in records:
            line = ""
            for key, value in record.items():
                if isinstance(value, list):
                    line += " | ".join([v.replace("\t", " ") for v in value if v is not None])
                else:
                    line += value.replace("\t", " ")
                line += "\t"
            line += "\n"
            tsv_lines.append(line)

        with codecs.open(output_path, "w", "UTF-8") as output_file:
            for line in tsv_lines:
                output_file.write(line)

        print "Completed writing {0}".format(output_path)

    def update_rdf_registry(self, rdf_dir="rdf", reset_matches=False):
        """Update listings of already-processed items.

        kwargs:
            rdf_dir(str): directory in which to find rdf.
            reset_matches(bool): reset match list and rebuild from scratch,
                based entirely on RDF files present in specified dir. Any
                records added to the registry through querying will be removed.
        """
        self.__load_match_data(reset_match_file=reset_matches)
        update_count = 0
        match_count = 0
        for root, dirs, files in os.walk(rdf_dir):
            for f in files:
                if f.endswith(".xml"):
                    root_name = os.path.splitext(f)[0]
                    if root_name not in self.existing_records:
                        self.existing_records.append(root_name)
                        update_count += 1
                    else:
                        match_count += 1
        print "Matching records: {0}".format(match_count)
        print "New records: {0}".format(update_count)
        self._store_match_data()

    def _store_match_data(self):
        """Store updated processed item list."""
        with open(self.match_file, "w") as match_file:
            json.dump(self.existing_records, match_file)

    def __load_dpla_key(self):
        """Load DPLA API Key from config file."""
        config = ConfigParser.RawConfigParser()
        config.read("default.cfg")
        return config.get("dpla_api", "api_key")

    def __load_match_data(self, reset_match_file=False):
        """Prepare data on previous search results."""
        self.match_file = self.__load_match_settings()
        if reset_match_file:
            self.existing_records = []
        else:
            with open(self.match_file, "r") as match_file:
                self.existing_records = json.load(match_file)

    def __load_match_settings(self):
        """Load file containing list of all previously processed items."""
        config = ConfigParser.RawConfigParser()
        config.read("default.cfg")
        return config.get("check_match", "match_file")

    def __process_metadata(self, item):
        """Process metadata from JSON DPLA record, pulling descriptive
        elements from the 'sourceResource' fields.

        Positional arguments:
        item (dict) -- Python dictionary from JSON results of DPLA search.
        """
        d_metadata = DplaMetadata(item["sourceResource"])
        d_metadata.compile()
        d_metadata.record["thumbnail"] = item.get("object", "")
        d_metadata.record["seeAlso"] = item.get("isShownAt", "")
        if "provider" in item:
            d_metadata.record["source"] = item["provider"]["name"]
        elif "dataProvider" in item:
            d_metadata.record["source"] = item["dataProvider"]
        else:
            d_metadata.record["source"] = ""
        d_metadata.record["discipline"] = self.disciplines
        d_metadata.record["genre"] = self._get_genre_from_marc(item)
        d_metadata.record["archive"] = ""
        d_metadata.record["role"] = ""
        d_metadata.record["federation"] = "SiRO"
        d_metadata.record["original_query"] = self.query
        d_metadata.record["id"] = item["@id"]
        self.metadata_records.append(d_metadata.record)

    def _get_genre_from_marc(self, item):
        """Check for 'literary form' value in MARC record.

        args:
            item (dict): Python dictionary from JSON results of DPLA search.
        returns:
            (str) genre value(s) to include in output.
        """
        genre_value = "none"
        value_map = {"0": "Nonfiction",
                     "1": "Fiction",
                     "d": "Drama",
                     "e": "Nonfiction",
                     "f": "Fiction",
                     "i": "Correspondence",
                     "j": "Fiction",
                     "p": "Poetry"
                     }
        """
        This method could be used with DPLA-returned MARC info, if the spacing
        of the 008 field was preserved.
        if self._marc_record(item):
            for field in item["originalRecord"]["controlfield"]:
                if field["tag"] == "008" and len(field["#text"]) > 33:
                    genre = field["#text"][33]
                    genre_value = value_map.get(genre, "")
        """
        # Access bibliographic information via the HathiTrust API.
        if "hathitrust" in item.get("isShownAt", ""):
            record = self._get_hathi_record(item)
            # print record
            marc_string = record["records"][str(self.hathi_id)]["marc-xml"]
            genre = self._extract_genre(marc_string)
            genre_value = value_map.get(genre, "")

        return genre_value

    def _extract_genre(self, marc_string):
        """Extract appropriate byte-mark in 008 to indicate genre.

        args:
            marc_string(str): marc xml as string.
        """
        path_008 = "/collection/record/controlfield[@tag='008']"
        tree = etree.fromstring(marc_string.encode("utf-8"))
        text_008 = tree.xpath(path_008)[0].text
        if len(text_008) > 33:
            genre = text_008[33]
        else:
            genre = "null"
        return genre

    def _get_hathi_record(self, item):
        """Get HathiTrust record.

        args:
            item (dict): Python dictionary from JSON results of DPLA search.
        """
        self.hathi_id = item["originalRecord"]["_id"]
        hbi = HathiBibApi()
        return hbi.get_record(self.hathi_id)

    def _marc_record(self, item):
        """Check if item contains a MARC record.

        args:
            item(dict): Python dictionary from JSON results of DPLA search.
        returns:
            is_marc(bool): true if item contains a marc record, false otherwise.
        """
        is_marc = False
        if "originalRecord" in item:
            if "controlfield" in item["originalRecord"]:
                is_marc = True
        return is_marc

    # Not using these anymore, might not work.
    def return_html(self, filepath="dpla.html"):
        with open(filepath, "w") as f:
            f.write("<table border='1' style='100%'>")
            f.write("<tr>")
            f.write("<td>Title</td>")
            f.write("<td>Link</td>")
            f.write("</tr>")
            for item in self.result.items:
                title = item["sourceResource"]["title"]
                url = unicode(item["isShownAt"])
                if isinstance(title, list):
                    title_clean = unicode(title[0])
                else:
                    title_clean = unicode(title)

                f.write("<tr>")
                f.write("<td>" + title_clean.encode("UTF-8") + "</td>")
                f.write("<td><a href='" + url + "'>" + url + "</a></td>")
                f.write("</tr>")
            f.write("</table>")

    def return_marcxml(self, output_path="marcxml"):
        """Write MARCXML files for the current result set into output_path."""
        if self.result is None:
            print "Error -- No data available -- Run search first"
            return

        xml_processed = 0
        errors = 0
        self.non_standard_records = 0
        self.marc_namespace = 0

        # Use the accumulated items from search(), not just the last page.
        for item in self.all_returned_items:
            json_record = item["originalRecord"]
            xml = self.build_marcxml_record(json_record)
            if xml is not None:
                with open(os.path.join(output_path, str(item["id"])) + ".xml", "w") as f:
                    f.write(etree.tostring(xml, encoding="utf-8", xml_declaration=True))
                xml_processed += 1
            else:
                errors += 1

        print "{0} Errors Returned".format(errors)
        print "{0} XML Files Written".format(xml_processed)
        print "{0} Non Standard Records".format(self.non_standard_records)
        print "{0} MARC Namespace Records".format(self.marc_namespace)

    def build_marcxml_record(self, json_record):

        root = None
        if "leader" not in json_record and "metadata" not in json_record:
            self.non_standard_records += 1
            #print "\n================Non Standard Record===============\n"
            #print json_record
            return None

        elif "metadata" in json_record:

            if "marc:record" in json_record["metadata"]:
                self.marc_namespace += 1
                json_record = json_record["metadata"]["marc:record"] 
                try:
                    root = etree.Element("record", xmlns="http://www.loc.gov/MARC21/slim")
                    leader = etree.SubElement(root, "leader")
                    leader.text = json_record["marc:leader"]
                    for c in json_record["marc:controlfield"]:
                        controlfield = etree.SubElement(root, "controlfield", tag=c["tag"])
                        controlfield.text = c["#text"]
                    for d in json_record["marc:datafield"]:
                        datafield = etree.SubElement(root, "datafield", tag=d["tag"], ind1=d["ind1"], ind2=d["ind2"])
                        # When there's only one subfield, d["subfield"] will return a dict. Otherwise, a list of dicts.
                        if isinstance(d["marc:subfield"], dict):
                            subfield = etree.SubElement(datafield, "subfield", code=d["marc:subfield"]["code"])
                            subfield.text = d["marc:subfield"]["#text"]
                        else:
                            for s in d["marc:subfield"]:
                                subfield = etree.SubElement(datafield, "subfield", code=s["code"])
                                subfield.text = s["#text"]
                except Exception as e:
                    
                    print "\n================Error In Record===============\n"
                    print e
                    print json_record
                    return None

        elif "leader" in json_record:

            try:
                root = etree.Element("record", xmlns="http://www.loc.gov/MARC21/slim")
                leader = etree.SubElement(root, "leader")
                leader.text = json_record["leader"]
                for c in json_record["controlfield"]:
                    controlfield = etree.SubElement(root, "controlfield", tag=c["tag"])
                    controlfield.text = c["#text"]
                for d in json_record["datafield"]:
                    datafield = etree.SubElement(root, "datafield", tag=d["tag"], ind1=d["ind1"], ind2=d["ind2"])
                    # When there's only one subfield, d["subfield"] will return a dict. Otherwise, a list of dicts.
                    if isinstance(d["subfield"], dict):
                        subfield = etree.SubElement(datafield, "subfield", code=d["subfield"]["code"])
                        subfield.text = d["subfield"]["#text"]
                    else:
                        for s in d["subfield"]:
                            subfield = etree.SubElement(datafield, "subfield", code=s["code"])
                            subfield.text = s["#text"]
            except Exception as e:
                
                print "\n================Error In Record===============\n"
                print e
                print json_record
                return None

        return root
        """