Beispiel #1
0
def fill_countries(file_path: Union[Path, str]) -> List[Country]:
    """Build ``Country`` objects from the rows of a .csv file.

    Every row produced by ``get_row(file_path)`` is handed to ``nullify``
    and then expanded as keyword arguments to the ``Country`` constructor.

    Args:
        file_path: Location of the .csv file, forwarded to ``get_row``.

    Returns:
        The created ``Country`` objects, in the order the rows were read.
    """
    countries: List[Country] = []
    for record in get_row(file_path):
        # nullify's return value is not used, so it presumably cleans the
        # row in place -- confirm against its definition.
        nullify(record)
        country = Country(**record)
        countries.append(country)
    return countries
Beispiel #2
0
def country_process() -> List[Country]:
    """Create the ``Country`` objects that are to be stored in the database.

    Rows are read with ``get_row`` from ``dbc.COUNTRIES_FILE``; each one is
    passed to ``nullify`` and then unpacked into the ``Country`` constructor.

    Returns:
        One ``Country`` per row, in file order.
    """
    result: List[Country] = []
    source_rows = get_row(dbc.COUNTRIES_FILE)
    for entry in source_rows:
        # The row is used after nullify() without taking a return value,
        # so the clean-up is presumably done in place.
        nullify(entry)
        result.append(Country(**entry))
    return result
Beispiel #3
0
def institution_process(countries: Dict[str, Country]) -> List[Institution]:
    """Build ``Institution`` objects from the GRID database .csv tables.

    Reads ``institutes.csv`` row by row and, for every institution, looks
    up its related records (addresses, acronyms, aliases, labels, links and
    types) by ``grid_id``. Along the way a "soup" string is assembled from
    the institution name, its country, acronyms, aliases and labels; it is
    stored on ``institution.soup`` for later fuzzy matching.

    Args:
        countries: ``Country`` objects indexed by the country name that
            ``dbc.country_name_mapper`` returns.

    Returns:
        One ``Institution`` per row of ``institutes.csv``, in file order.

    Raises:
        KeyError: If an address names a country whose mapped name is not a
            key of ``countries``.
    """
    institutes_file = dbc.GRID_DATABASE_DIR / "institutes.csv"
    rows = get_row(institutes_file)
    row_count = csv_size(institutes_file)

    attrs = ["addresses", "acronyms", "aliases", "labels", "links", "types"]
    # Group the GRID data tables by grid_id for better access.
    institution_attrs = [
        get_csv(dbc.GRID_DATABASE_DIR / f"{attr}.csv", "grid_id")
        for attr in attrs
    ]

    institutions_list: List[Institution] = []
    # The context manager closes the progress bar even if a row raises.
    with tqdm(total=row_count) as pbar:
        for row in rows:
            nullify(row)
            # Get all the data related to the current institution.
            # (`inst_type` rather than `type`, to keep the builtin usable.)
            address, acronym, alias, label, link, inst_type = [
                attr.get(row["grid_id"]) for attr in institution_attrs
            ]

            # Create 'soup' variable for fuzzy matching of institutions.
            soup = [row["name"]]

            if address:
                # Only the first address is used. "country" is popped so it
                # is not forwarded as a keyword to Institution; the mapped
                # name is resolved to a Country object instead.
                country = dbc.country_name_mapper(address[0].pop("country"))
                institution = Institution(**{**row, **address[0]})
                institution.country = countries[country]
                soup.append(country)
            else:
                institution = Institution(**row)

            if acronym:
                institution.acronyms = [Acronym(**i) for i in acronym]
                soup.extend(i["acronym"] for i in acronym)
            if alias:
                institution.aliases = [Alias(**i) for i in alias]
                soup.extend(i["alias"] for i in alias)
            if label:
                institution.labels = [Label(**i) for i in label]
                soup.extend(i["label"] for i in label)
            if link:
                institution.links = [Link(**i) for i in link]
            if inst_type:
                institution.types = [Type(**i) for i in inst_type]

            institution.soup = " | ".join(soup)
            institutions_list.append(institution)

            pbar.update()

    del institution_attrs  # Free-up memory (~ 10^5 institutions).

    return institutions_list
Beispiel #4
0
 def from_portfolio(cls, portfolio_row):
     u"""Get or create the Category named by a portfolio row.

     The cell under the Hebrew "category" column carries the English and
     the Hebrew name together. It is split on whitespace; words for which
     ``has_hebrew_chars`` is true make up the Hebrew name and all other
     words make up the English name, each keeping its original word order.

     The lookup is keyed on ``name_he`` only; ``name_en`` is passed via
     ``defaults``.

     >>> cat = Category.from_portfolio({u'קטגוריה': u'Art and Design אמנות ועיצוב'})
     >>> cat.name_he == u'אמנות ועיצוב' and cat.name_en == 'Art and Design'
     True
     """
     from utils import has_hebrew_chars
     name_en = []
     name_he = []
     # split() without a separator collapses runs of whitespace and drops
     # leading/trailing whitespace. split(' ') would yield empty "words"
     # that end up as stray spaces in the names and so in the lookup key.
     for word in portfolio_row[u'קטגוריה'].split():
         if has_hebrew_chars(word):
             name_he.append(word)
         else:
             name_en.append(word)
     lookup = Category.objects.get_or_create(
         name_he=nullify(' '.join(name_he)),
         defaults={'name_en': nullify(' '.join(name_en))})
     # get_or_create's result is indexed; element 0 is the object itself.
     return lookup[0]
Beispiel #5
0
def ranking_process(
    db: Session, file_path: Union[Path, str], soup: Dict[str, Dict[str, str]]
) -> Tuple[List[Institution], List[Dict[str, str]], List[Dict[str, str]]]:
    """Matches institutions with their GRID database counterparts.

    The function reads a .csv file row-by-row and for each row, tries
    to match the institution to a GRID ID, using various criteria. If
    that succeeds, the function then assigns the different metrics in
    the ranking to that institution object.

    Matching is attempted in this order, stopping at the first hit:

    1. ranking URL against the stored institution links of the same type,
    2. manual matches in ``dbc.MATCHES`` for the institution's country,
    3. exact (lower-cased) name within the same country,
    4. fuzzy matching through ``fuzzy_matcher``.

    Args:
        db (Session): SQLAlchemy session instance to connect to the DB
        file_path (Union[Path, str]): The path to the ranking .csv file
        soup (Dict[str, Dict[str, str]]): A set of choices for matching
        institutions

    Returns:
        Tuple[List[Institution], List[Dict[str, str]], List[Dict[str, str]]]:
        Three lists: matched institutions, not-matched institutions, and
        institutions that were matched using the fuzzy matcher.
    """
    institutions_list: List[Institution] = []
    not_matched_list: List[Dict[str, str]] = []
    fuzz_list: List[Dict[str, str]] = []

    # useful queries:
    q1 = db.query(Institution)
    q2 = q1.join(Institution.country)

    rows = get_row(file_path)
    row_count = csv_size(file_path)

    # The context manager closes the progress bar even if a row raises.
    with tqdm(total=row_count) as pbar:
        for row in rows:
            # Updated first so rows skipped via `continue` are counted too.
            pbar.update()
            nullify(row)
            link_type = row["Ranking System"]
            inst_name = row["Institution"].strip().lower()
            inst_country = dbc.country_name_mapper(row["Country"])
            inst_url = row["URL"]

            inst_info = {
                "Raw": inst_name,
                "Country": inst_country,
                "URL": inst_url,
                "Ranking System": row["Ranking System"],
                "Ranking Type": row["Ranking Type"],
                "Year": row["Year"],
                "Field": row["Field"],
                "Subject": row["Subject"],
            }

            # checking link with institution links
            inst: Institution = q1.join(Institution.links).filter(
                Link.link == inst_url, Link.type == link_type).first()

            # checking grid_id in manual matches
            if not inst and inst_country in dbc.MATCHES:
                match = dbc.MATCHES[inst_country].get(inst_name)
                if match:
                    inst = q1.filter(Institution.grid_id == match).first()

            # checking name with institution name
            if not inst:
                inst = q2.filter(
                    func.lower(Institution.name) == inst_name,
                    Country.country == inst_country,
                ).first()

            # fuzzy-matching of strings
            if not inst:
                inst_grid_id = fuzzy_matcher(inst_name, inst_country, soup)
                if inst_grid_id:
                    inst = q1.filter(
                        Institution.grid_id == inst_grid_id).first()
                    # The matcher may return a GRID ID that has no row in
                    # the database; then `inst` is None and the row is
                    # reported as "Not Matched" below instead of crashing
                    # on `inst.name`.
                    if inst:
                        fuzz_list.append({
                            "Fuzzy": inst.name,
                            "GRID ID": inst_grid_id,
                            **inst_info
                        })

            # could not match, or was matched before (with another institution)
            if not inst or inst in institutions_list:
                not_matched_list.append({
                    "Problem": (
                        "Not Matched" if not inst else f"Double: {inst}"
                    ),
                    **inst_info,
                })
                continue

            ranking_metrics = metrics_process(row)
            # Add the ranking URL as a link only when the institution has
            # no link of this ranking system yet and the row has a URL.
            inst_link_types = [link.type.name for link in inst.links]
            if (link_type not in inst_link_types) and inst_url:
                inst.links.append(Link(type=link_type, link=inst_url))
            inst.rankings.extend(ranking_metrics)
            institutions_list.append(inst)

    return (institutions_list, not_matched_list, fuzz_list)
Beispiel #6
0
 def from_portfolio(cls, portfolio_row):
     """Get or create one Subject per comma-separated name in the row.

     The cell under the Hebrew "subject" column is split on commas; each
     piece is stripped, passed through ``nullify`` and used as ``name_he``
     in a ``Subject.objects.get_or_create`` lookup.

     Returns a list holding element 0 of every lookup result, in the
     order the names appear in the cell.
     """
     subjects = []
     for raw_name in portfolio_row[u'נושא'].split(','):
         lookup = Subject.objects.get_or_create(
             name_he=nullify(raw_name.strip()))
         subjects.append(lookup[0])
     return subjects
Beispiel #7
0
 def from_portfolio(cls, portfolio_row):
     """Get or create the Technique described by a portfolio row.

     The Hebrew name is read from the Hebrew "technique" column and the
     English name from the 'Technique' column; both go through ``nullify``
     and are used together as the lookup fields.

     Returns element 0 of the ``get_or_create`` result.
     """
     get_or_create = Technique.objects.get_or_create
     hebrew_name = nullify(portfolio_row[u'טכניקה'])
     english_name = nullify(portfolio_row['Technique'])
     lookup = get_or_create(name_he=hebrew_name, name_en=english_name)
     return lookup[0]
Beispiel #8
0
 def from_portfolio(cls, portfolio_row):
     """Get or create the Client described by a portfolio row.

     The Hebrew name is read from the Hebrew "client" column and the
     English name from the 'Client' column; both go through ``nullify``
     and are used together as the lookup fields.

     Returns element 0 of the ``get_or_create`` result.
     """
     get_or_create = Client.objects.get_or_create
     hebrew_name = nullify(portfolio_row[u'לקוח'])
     english_name = nullify(portfolio_row['Client'])
     lookup = get_or_create(name_he=hebrew_name, name_en=english_name)
     return lookup[0]
Beispiel #9
0
 def from_portfolio(cls, portfolio_row):
     """Get or create the Country described by a portfolio row.

     The Hebrew name is read from the Hebrew "country" column and the
     English name from the 'Country' column; both go through ``nullify``
     and are used together as the lookup fields.

     Returns element 0 of the ``get_or_create`` result.
     """
     get_or_create = Country.objects.get_or_create
     hebrew_name = nullify(portfolio_row[u'ארץ'])
     english_name = nullify(portfolio_row[u'Country'])
     lookup = get_or_create(name_he=hebrew_name, name_en=english_name)
     return lookup[0]